From 6443f0eff879ac66cb1bc442ff9707d70e30dfbb Mon Sep 17 00:00:00 2001 From: Ray Douglass Date: Thu, 20 Jul 2023 16:29:17 -0400 Subject: [PATCH 001/230] v23.10 --- .github/workflows/build.yaml | 16 +++++----- .github/workflows/pr.yaml | 30 +++++++++---------- .github/workflows/test.yaml | 18 +++++------ README.md | 2 +- ci/build_docs.sh | 2 +- ci/check_style.sh | 2 +- ci/release/update-version.sh | 8 +++++ .../all_cuda-118_arch-x86_64.yaml | 8 ++--- .../all_cuda-120_arch-x86_64.yaml | 8 ++--- cpp/CMakeLists.txt | 2 +- cpp/doxygen/Doxyfile | 4 +-- cpp/examples/basic/CMakeLists.txt | 2 +- cpp/examples/strings/CMakeLists.txt | 2 +- cpp/libcudf_kafka/CMakeLists.txt | 2 +- dependencies.yaml | 14 ++++----- docs/cudf/source/conf.py | 4 +-- docs/dask_cudf/source/conf.py | 4 +-- fetch_rapids.cmake | 2 +- java/ci/README.md | 4 +-- java/pom.xml | 2 +- java/src/main/native/CMakeLists.txt | 2 +- python/cudf/CMakeLists.txt | 2 +- python/cudf/cudf/__init__.py | 2 +- python/cudf/pyproject.toml | 6 ++-- python/cudf_kafka/pyproject.toml | 4 +-- python/custreamz/pyproject.toml | 6 ++-- python/dask_cudf/dask_cudf/__init__.py | 2 +- python/dask_cudf/pyproject.toml | 6 ++-- 28 files changed, 87 insertions(+), 79 deletions(-) diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml index 15f3269efaf..e11a797cedd 100644 --- a/.github/workflows/build.yaml +++ b/.github/workflows/build.yaml @@ -28,7 +28,7 @@ concurrency: jobs: cpp-build: secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/conda-cpp-build.yaml@branch-23.08 + uses: rapidsai/shared-action-workflows/.github/workflows/conda-cpp-build.yaml@branch-23.10 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -37,7 +37,7 @@ jobs: python-build: needs: [cpp-build] secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/conda-python-build.yaml@branch-23.08 + uses: rapidsai/shared-action-workflows/.github/workflows/conda-python-build.yaml@branch-23.10 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -46,7 +46,7 @@ jobs: upload-conda: needs: [cpp-build, python-build] secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/conda-upload-packages.yaml@branch-23.08 + uses: rapidsai/shared-action-workflows/.github/workflows/conda-upload-packages.yaml@branch-23.10 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -57,7 +57,7 @@ jobs: if: github.ref_type == 'branch' needs: python-build secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/custom-job.yaml@branch-23.08 + uses: rapidsai/shared-action-workflows/.github/workflows/custom-job.yaml@branch-23.10 with: arch: "amd64" branch: ${{ inputs.branch }} @@ -69,7 +69,7 @@ jobs: sha: ${{ inputs.sha }} wheel-build-cudf: secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/wheels-manylinux-build.yml@branch-23.08 + uses: rapidsai/shared-action-workflows/.github/workflows/wheels-manylinux-build.yml@branch-23.10 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -81,7 +81,7 @@ jobs: wheel-publish-cudf: needs: wheel-build-cudf secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/wheels-manylinux-publish.yml@branch-23.08 + uses: rapidsai/shared-action-workflows/.github/workflows/wheels-manylinux-publish.yml@branch-23.10 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -91,7 +91,7 @@ jobs: 
wheel-build-dask-cudf: needs: wheel-publish-cudf secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/wheels-pure-build.yml@branch-23.08 + uses: rapidsai/shared-action-workflows/.github/workflows/wheels-pure-build.yml@branch-23.10 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -102,7 +102,7 @@ jobs: wheel-publish-dask-cudf: needs: wheel-build-dask-cudf secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/wheels-pure-publish.yml@branch-23.08 + uses: rapidsai/shared-action-workflows/.github/workflows/wheels-pure-publish.yml@branch-23.10 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml index f157a3b04cb..225ac91cbae 100644 --- a/.github/workflows/pr.yaml +++ b/.github/workflows/pr.yaml @@ -26,34 +26,34 @@ jobs: - wheel-build-dask-cudf - wheel-tests-dask-cudf secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/pr-builder.yaml@branch-23.08 + uses: rapidsai/shared-action-workflows/.github/workflows/pr-builder.yaml@branch-23.10 checks: secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/checks.yaml@branch-23.08 + uses: rapidsai/shared-action-workflows/.github/workflows/checks.yaml@branch-23.10 with: enable_check_generated_files: false conda-cpp-build: needs: checks secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/conda-cpp-build.yaml@branch-23.08 + uses: rapidsai/shared-action-workflows/.github/workflows/conda-cpp-build.yaml@branch-23.10 with: build_type: pull-request conda-cpp-tests: needs: conda-cpp-build secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/conda-cpp-tests.yaml@branch-23.08 + uses: rapidsai/shared-action-workflows/.github/workflows/conda-cpp-tests.yaml@branch-23.10 with: build_type: pull-request conda-python-build: needs: conda-cpp-build secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/conda-python-build.yaml@branch-23.08 + uses: rapidsai/shared-action-workflows/.github/workflows/conda-python-build.yaml@branch-23.10 with: build_type: pull-request conda-python-cudf-tests: needs: conda-python-build secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/conda-python-tests.yaml@branch-23.08 + uses: rapidsai/shared-action-workflows/.github/workflows/conda-python-tests.yaml@branch-23.10 with: build_type: pull-request test_script: "ci/test_python_cudf.sh" @@ -61,14 +61,14 @@ jobs: # Tests for dask_cudf, custreamz, cudf_kafka are separated for CI parallelism needs: conda-python-build secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/conda-python-tests.yaml@branch-23.08 + uses: rapidsai/shared-action-workflows/.github/workflows/conda-python-tests.yaml@branch-23.10 with: build_type: pull-request test_script: "ci/test_python_other.sh" conda-java-tests: needs: conda-cpp-build secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/custom-job.yaml@branch-23.08 + uses: rapidsai/shared-action-workflows/.github/workflows/custom-job.yaml@branch-23.10 with: build_type: pull-request node_type: "gpu-v100-latest-1" @@ -78,7 +78,7 @@ jobs: conda-notebook-tests: needs: conda-python-build secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/custom-job.yaml@branch-23.08 + uses: rapidsai/shared-action-workflows/.github/workflows/custom-job.yaml@branch-23.10 with: build_type: pull-request node_type: 
"gpu-v100-latest-1" @@ -88,7 +88,7 @@ jobs: docs-build: needs: conda-python-build secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/custom-job.yaml@branch-23.08 + uses: rapidsai/shared-action-workflows/.github/workflows/custom-job.yaml@branch-23.10 with: build_type: pull-request node_type: "gpu-v100-latest-1" @@ -98,7 +98,7 @@ jobs: wheel-build-cudf: needs: checks secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/wheels-manylinux-build.yml@branch-23.08 + uses: rapidsai/shared-action-workflows/.github/workflows/wheels-manylinux-build.yml@branch-23.10 with: build_type: pull-request package-name: cudf @@ -107,7 +107,7 @@ jobs: wheel-tests-cudf: needs: wheel-build-cudf secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/wheels-manylinux-test.yml@branch-23.08 + uses: rapidsai/shared-action-workflows/.github/workflows/wheels-manylinux-test.yml@branch-23.10 with: build_type: pull-request package-name: cudf @@ -116,7 +116,7 @@ jobs: wheel-build-dask-cudf: needs: wheel-tests-cudf secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/wheels-pure-build.yml@branch-23.08 + uses: rapidsai/shared-action-workflows/.github/workflows/wheels-pure-build.yml@branch-23.10 with: build_type: pull-request package-name: dask_cudf @@ -125,10 +125,10 @@ jobs: wheel-tests-dask-cudf: needs: wheel-build-dask-cudf secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/wheels-pure-test.yml@branch-23.08 + uses: rapidsai/shared-action-workflows/.github/workflows/wheels-pure-test.yml@branch-23.10 with: build_type: pull-request package-name: dask_cudf # Install the cudf we just built, and also test against latest dask/distributed/dask-cuda. - test-before: "RAPIDS_PY_WHEEL_NAME=cudf_${{ '${PIP_CU_VERSION}' }} rapids-download-wheels-from-s3 ./local-cudf-dep && python -m pip install --no-deps ./local-cudf-dep/cudf*.whl && python -m pip install git+https://github.com/dask/dask.git@main git+https://github.com/dask/distributed.git@main git+https://github.com/rapidsai/dask-cuda.git@branch-23.08" + test-before: "RAPIDS_PY_WHEEL_NAME=cudf_${{ '${PIP_CU_VERSION}' }} rapids-download-wheels-from-s3 ./local-cudf-dep && python -m pip install --no-deps ./local-cudf-dep/cudf*.whl && python -m pip install git+https://github.com/dask/dask.git@main git+https://github.com/dask/distributed.git@main git+https://github.com/rapidsai/dask-cuda.git@branch-23.10" test-unittest: "python -m pytest -n 8 ./python/dask_cudf/dask_cudf/tests" diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml index b5563d79466..b200904a720 100644 --- a/.github/workflows/test.yaml +++ b/.github/workflows/test.yaml @@ -16,7 +16,7 @@ on: jobs: conda-cpp-tests: secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/conda-cpp-tests.yaml@branch-23.08 + uses: rapidsai/shared-action-workflows/.github/workflows/conda-cpp-tests.yaml@branch-23.10 with: build_type: nightly branch: ${{ inputs.branch }} @@ -24,7 +24,7 @@ jobs: sha: ${{ inputs.sha }} conda-cpp-memcheck-tests: secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/custom-job.yaml@branch-23.08 + uses: rapidsai/shared-action-workflows/.github/workflows/custom-job.yaml@branch-23.10 with: build_type: nightly branch: ${{ inputs.branch }} @@ -36,7 +36,7 @@ jobs: run_script: "ci/test_cpp_memcheck.sh" conda-python-cudf-tests: secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/conda-python-tests.yaml@branch-23.08 + uses: 
rapidsai/shared-action-workflows/.github/workflows/conda-python-tests.yaml@branch-23.10 with: build_type: nightly branch: ${{ inputs.branch }} @@ -46,7 +46,7 @@ jobs: conda-python-other-tests: # Tests for dask_cudf, custreamz, cudf_kafka are separated for CI parallelism secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/conda-python-tests.yaml@branch-23.08 + uses: rapidsai/shared-action-workflows/.github/workflows/conda-python-tests.yaml@branch-23.10 with: build_type: nightly branch: ${{ inputs.branch }} @@ -55,7 +55,7 @@ jobs: test_script: "ci/test_python_other.sh" conda-java-tests: secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/custom-job.yaml@branch-23.08 + uses: rapidsai/shared-action-workflows/.github/workflows/custom-job.yaml@branch-23.10 with: build_type: nightly branch: ${{ inputs.branch }} @@ -67,7 +67,7 @@ jobs: run_script: "ci/test_java.sh" conda-notebook-tests: secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/custom-job.yaml@branch-23.08 + uses: rapidsai/shared-action-workflows/.github/workflows/custom-job.yaml@branch-23.10 with: build_type: nightly branch: ${{ inputs.branch }} @@ -79,7 +79,7 @@ jobs: run_script: "ci/test_notebooks.sh" wheel-tests-cudf: secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/wheels-manylinux-test.yml@branch-23.08 + uses: rapidsai/shared-action-workflows/.github/workflows/wheels-manylinux-test.yml@branch-23.10 with: build_type: nightly branch: ${{ inputs.branch }} @@ -89,7 +89,7 @@ jobs: test-unittest: "python -m pytest -n 8 ./python/cudf/cudf/tests" wheel-tests-dask-cudf: secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/wheels-pure-test.yml@branch-23.08 + uses: rapidsai/shared-action-workflows/.github/workflows/wheels-pure-test.yml@branch-23.10 with: build_type: nightly branch: ${{ inputs.branch }} @@ -97,5 +97,5 @@ jobs: sha: ${{ inputs.sha }} package-name: dask_cudf # Test against latest dask/distributed/dask-cuda. 
- test-before: "python -m pip install git+https://github.com/dask/dask.git@main git+https://github.com/dask/distributed.git@main git+https://github.com/rapidsai/dask-cuda.git@branch-23.08" + test-before: "python -m pip install git+https://github.com/dask/dask.git@main git+https://github.com/dask/distributed.git@main git+https://github.com/rapidsai/dask-cuda.git@branch-23.10" test-unittest: "python -m pytest -n 8 ./python/dask_cudf/dask_cudf/tests" diff --git a/README.md b/README.md index b1c498a2c95..64c980d0cb3 100644 --- a/README.md +++ b/README.md @@ -65,7 +65,7 @@ cuDF can be installed with conda (via [miniconda](https://conda.io/miniconda.htm ```bash conda install -c rapidsai -c conda-forge -c nvidia \ - cudf=23.08 python=3.10 cuda-version=11.8 + cudf=23.10 python=3.10 cuda-version=11.8 ``` We also provide [nightly Conda packages](https://anaconda.org/rapidsai-nightly) built from the HEAD diff --git a/ci/build_docs.sh b/ci/build_docs.sh index e831cde08cb..52e9419c82d 100755 --- a/ci/build_docs.sh +++ b/ci/build_docs.sh @@ -25,7 +25,7 @@ rapids-mamba-retry install \ --channel "${PYTHON_CHANNEL}" \ libcudf cudf dask-cudf -export RAPIDS_VERSION_NUMBER="23.08" +export RAPIDS_VERSION_NUMBER="23.10" export RAPIDS_DOCS_DIR="$(mktemp -d)" rapids-logger "Build CPP docs" diff --git a/ci/check_style.sh b/ci/check_style.sh index 4beaca333fd..e96ad8bf1db 100755 --- a/ci/check_style.sh +++ b/ci/check_style.sh @@ -14,7 +14,7 @@ rapids-dependency-file-generator \ rapids-mamba-retry env create --force -f env.yaml -n checks conda activate checks -FORMAT_FILE_URL=https://raw.githubusercontent.com/rapidsai/rapids-cmake/branch-23.08/cmake-format-rapids-cmake.json +FORMAT_FILE_URL=https://raw.githubusercontent.com/rapidsai/rapids-cmake/branch-23.10/cmake-format-rapids-cmake.json export RAPIDS_CMAKE_FORMAT_FILE=/tmp/rapids_cmake_ci/cmake-formats-rapids-cmake.json mkdir -p $(dirname ${RAPIDS_CMAKE_FORMAT_FILE}) wget -O ${RAPIDS_CMAKE_FORMAT_FILE} ${FORMAT_FILE_URL} diff --git a/ci/release/update-version.sh b/ci/release/update-version.sh index 92627b6e83b..8a03ab2e71d 100755 --- a/ci/release/update-version.sh +++ b/ci/release/update-version.sh @@ -21,12 +21,14 @@ CURRENT_SHORT_TAG=${CURRENT_MAJOR}.${CURRENT_MINOR} #Get . 
for next version NEXT_MAJOR=$(echo $NEXT_FULL_TAG | awk '{split($0, a, "."); print a[1]}') NEXT_MINOR=$(echo $NEXT_FULL_TAG | awk '{split($0, a, "."); print a[2]}') +NEXT_PATCH=$(echo $NEXT_FULL_TAG | awk '{split($0, a, "."); print a[3]}') NEXT_SHORT_TAG=${NEXT_MAJOR}.${NEXT_MINOR} NEXT_UCX_PY_VERSION="$(curl -sL https://version.gpuci.io/rapids/${NEXT_SHORT_TAG}).*" # Need to distutils-normalize the versions for some use cases CURRENT_SHORT_TAG_PEP440=$(python -c "from setuptools.extern import packaging; print(packaging.version.Version('${CURRENT_SHORT_TAG}'))") NEXT_SHORT_TAG_PEP440=$(python -c "from setuptools.extern import packaging; print(packaging.version.Version('${NEXT_SHORT_TAG}'))") +PATCH_PEP440=$(python -c "from setuptools.extern import packaging; print(packaging.version.Version('${NEXT_PATCH}'))") echo "current is ${CURRENT_SHORT_TAG_PEP440}, next is ${NEXT_SHORT_TAG_PEP440}" echo "Preparing release $CURRENT_TAG => $NEXT_FULL_TAG" @@ -112,3 +114,9 @@ for FILE in .github/workflows/*.yaml; do sed_runner "s/dask-cuda.git@branch-[^\"\s]\+/dask-cuda.git@branch-${NEXT_SHORT_TAG}/g" ${FILE}; done sed_runner "s/RAPIDS_VERSION_NUMBER=\".*/RAPIDS_VERSION_NUMBER=\"${NEXT_SHORT_TAG}\"/g" ci/build_docs.sh + +# Java files +NEXT_FULL_JAVA_TAG="${NEXT_SHORT_TAG}.${PATCH_PEP440}-SNAPSHOT" +sed_runner "s|.*-SNAPSHOT|${NEXT_FULL_JAVA_TAG}|g" java/pom.xml +sed_runner "s/branch-.*/branch-${NEXT_SHORT_TAG}/g" java/ci/README.md +sed_runner "s/cudf-.*-SNAPSHOT/cudf-${NEXT_FULL_JAVA_TAG}/g" java/ci/README.md diff --git a/conda/environments/all_cuda-118_arch-x86_64.yaml b/conda/environments/all_cuda-118_arch-x86_64.yaml index 71e5d129d80..7c88b112922 100644 --- a/conda/environments/all_cuda-118_arch-x86_64.yaml +++ b/conda/environments/all_cuda-118_arch-x86_64.yaml @@ -24,7 +24,7 @@ dependencies: - cxx-compiler - cython>=0.29,<0.30 - dask-core>=2023.5.1 -- dask-cuda==23.8.* +- dask-cuda==23.10.* - dask>=2023.5.1 - distributed>=2023.5.1 - dlpack>=0.5,<0.6.0a0 @@ -43,9 +43,9 @@ dependencies: - libcufile=1.4.0.31 - libcurand-dev=10.3.0.86 - libcurand=10.3.0.86 -- libkvikio==23.8.* +- libkvikio==23.10.* - librdkafka>=1.9.0,<1.10.0a0 -- librmm==23.8.* +- librmm==23.10.* - mimesis>=4.1.0 - moto>=4.0.8 - msgpack-python @@ -78,7 +78,7 @@ dependencies: - python-snappy>=0.6.0 - python>=3.9,<3.11 - pytorch<1.12.0 -- rmm==23.8.* +- rmm==23.10.* - s3fs>=2022.3.0 - scikit-build>=0.13.1 - scipy diff --git a/conda/environments/all_cuda-120_arch-x86_64.yaml b/conda/environments/all_cuda-120_arch-x86_64.yaml index ba42f9a8165..21ea3efd8bc 100644 --- a/conda/environments/all_cuda-120_arch-x86_64.yaml +++ b/conda/environments/all_cuda-120_arch-x86_64.yaml @@ -25,7 +25,7 @@ dependencies: - cxx-compiler - cython>=0.29,<0.30 - dask-core>=2023.5.1 -- dask-cuda==23.8.* +- dask-cuda==23.10.* - dask>=2023.5.1 - distributed>=2023.5.1 - dlpack>=0.5,<0.6.0a0 @@ -42,9 +42,9 @@ dependencies: - libarrow==11.0.0.* - libcufile-dev - libcurand-dev -- libkvikio==23.8.* +- libkvikio==23.10.* - librdkafka>=1.9.0,<1.10.0a0 -- librmm==23.8.* +- librmm==23.10.* - mimesis>=4.1.0 - moto>=4.0.8 - msgpack-python @@ -75,7 +75,7 @@ dependencies: - python-snappy>=0.6.0 - python>=3.9,<3.11 - pytorch<1.12.0 -- rmm==23.8.* +- rmm==23.10.* - s3fs>=2022.3.0 - scikit-build>=0.13.1 - scipy diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index d1cf337b56e..8a19af31bf5 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -25,7 +25,7 @@ rapids_cuda_init_architectures(CUDF) project( CUDF - VERSION 23.08.00 + VERSION 23.10.00 LANGUAGES C CXX CUDA ) 
if(CMAKE_CUDA_COMPILER_ID STREQUAL "NVIDIA" AND CMAKE_CUDA_COMPILER_VERSION VERSION_LESS 11.5) diff --git a/cpp/doxygen/Doxyfile b/cpp/doxygen/Doxyfile index faf995cc7d0..357daed243b 100644 --- a/cpp/doxygen/Doxyfile +++ b/cpp/doxygen/Doxyfile @@ -38,7 +38,7 @@ PROJECT_NAME = "libcudf" # could be handy for archiving the generated documentation or if some version # control system is used. -PROJECT_NUMBER = 23.08.00 +PROJECT_NUMBER = 23.10.00 # Using the PROJECT_BRIEF tag one can provide an optional one line description # for a project that appears at the top of each page and should give viewer a @@ -2162,7 +2162,7 @@ SKIP_FUNCTION_MACROS = YES # the path). If a tag file is not located in the directory in which doxygen is # run, you must also specify the path to the tagfile here. -TAGFILES = rmm.tag=https://docs.rapids.ai/api/librmm/23.08 +TAGFILES = rmm.tag=https://docs.rapids.ai/api/librmm/23.10 # When a file name is specified after GENERATE_TAGFILE, doxygen will create a # tag file that is based on the input files it reads. See section "Linking to diff --git a/cpp/examples/basic/CMakeLists.txt b/cpp/examples/basic/CMakeLists.txt index a9479c517a3..1c1952c4616 100644 --- a/cpp/examples/basic/CMakeLists.txt +++ b/cpp/examples/basic/CMakeLists.txt @@ -16,7 +16,7 @@ file( ) include(${CMAKE_BINARY_DIR}/cmake/get_cpm.cmake) -set(CUDF_TAG branch-23.08) +set(CUDF_TAG branch-23.10) CPMFindPackage( NAME cudf GIT_REPOSITORY https://github.com/rapidsai/cudf GIT_TAG ${CUDF_TAG} diff --git a/cpp/examples/strings/CMakeLists.txt b/cpp/examples/strings/CMakeLists.txt index 5e9b6450553..31a6b12a4bc 100644 --- a/cpp/examples/strings/CMakeLists.txt +++ b/cpp/examples/strings/CMakeLists.txt @@ -16,7 +16,7 @@ file( ) include(${CMAKE_BINARY_DIR}/cmake/get_cpm.cmake) -set(CUDF_TAG branch-23.08) +set(CUDF_TAG branch-23.10) CPMFindPackage( NAME cudf GIT_REPOSITORY https://github.com/rapidsai/cudf GIT_TAG ${CUDF_TAG} diff --git a/cpp/libcudf_kafka/CMakeLists.txt b/cpp/libcudf_kafka/CMakeLists.txt index 37e61ac66db..33bd04fffb3 100644 --- a/cpp/libcudf_kafka/CMakeLists.txt +++ b/cpp/libcudf_kafka/CMakeLists.txt @@ -22,7 +22,7 @@ include(rapids-find) project( CUDA_KAFKA - VERSION 23.08.00 + VERSION 23.10.00 LANGUAGES CXX ) diff --git a/dependencies.yaml b/dependencies.yaml index 183451670bc..9f530e83e47 100644 --- a/dependencies.yaml +++ b/dependencies.yaml @@ -213,8 +213,8 @@ dependencies: common: - output_types: [conda, requirements] packages: - - librmm==23.8.* - - libkvikio==23.8.* + - librmm==23.10.* + - libkvikio==23.10.* - output_types: conda packages: - fmt>=9.1.0,<10 @@ -260,7 +260,7 @@ dependencies: - output_types: [conda, requirements, pyproject] packages: - scikit-build>=0.13.1 - - rmm==23.8.* + - rmm==23.10.* - output_types: conda packages: - &protobuf protobuf>=4.21.6,<4.22 @@ -422,7 +422,7 @@ dependencies: - &numba numba>=0.57 - nvtx>=0.2.1 - packaging - - rmm==23.8.* + - rmm==23.10.* - typing_extensions>=4.0.0 - *protobuf - output_types: conda @@ -483,7 +483,7 @@ dependencies: - dask-core>=2023.5.1 # dask-core in conda is the actual package & dask is the meta package - output_types: pyproject packages: - - &cudf cudf==23.8.* + - &cudf cudf==23.10.* - *cupy_pip run_cudf_kafka: common: @@ -502,7 +502,7 @@ dependencies: packages: - confluent-kafka>=1.9.0,<1.10.0a0 - *cudf - - cudf_kafka==23.8.* + - cudf_kafka==23.10.* test_cpp: common: - output_types: conda @@ -600,5 +600,5 @@ dependencies: common: - output_types: [conda, requirements, pyproject] packages: - - dask-cuda==23.8.* + - dask-cuda==23.10.* - 
*numba diff --git a/docs/cudf/source/conf.py b/docs/cudf/source/conf.py index 627d99b7f04..1654750bdf8 100644 --- a/docs/cudf/source/conf.py +++ b/docs/cudf/source/conf.py @@ -81,9 +81,9 @@ # built documents. # # The short X.Y version. -version = '23.08' +version = '23.10' # The full version, including alpha/beta/rc tags. -release = '23.08.00' +release = '23.10.00' # The language for content autogenerated by Sphinx. Refer to documentation # for a list of supported languages. diff --git a/docs/dask_cudf/source/conf.py b/docs/dask_cudf/source/conf.py index 4b367c65762..ad629b5e949 100644 --- a/docs/dask_cudf/source/conf.py +++ b/docs/dask_cudf/source/conf.py @@ -11,8 +11,8 @@ project = "dask-cudf" copyright = "2018-2023, NVIDIA Corporation" author = "NVIDIA Corporation" -version = '23.08' -release = '23.08.00' +version = '23.10' +release = '23.10.00' language = "en" diff --git a/fetch_rapids.cmake b/fetch_rapids.cmake index 7ef42466e9f..4a68c7dbc60 100644 --- a/fetch_rapids.cmake +++ b/fetch_rapids.cmake @@ -12,7 +12,7 @@ # the License. # ============================================================================= if(NOT EXISTS ${CMAKE_CURRENT_BINARY_DIR}/CUDF_RAPIDS.cmake) - file(DOWNLOAD https://raw.githubusercontent.com/rapidsai/rapids-cmake/branch-23.08/RAPIDS.cmake + file(DOWNLOAD https://raw.githubusercontent.com/rapidsai/rapids-cmake/branch-23.10/RAPIDS.cmake ${CMAKE_CURRENT_BINARY_DIR}/CUDF_RAPIDS.cmake ) endif() diff --git a/java/ci/README.md b/java/ci/README.md index ec3e6568b0d..e9599b33bf1 100644 --- a/java/ci/README.md +++ b/java/ci/README.md @@ -34,7 +34,7 @@ nvidia-docker run -it cudf-build:11.8.0-devel-centos7 bash You can download the cuDF repo in the docker container or you can mount it into the container. Here I choose to download again in the container. ```bash -git clone --recursive https://github.com/rapidsai/cudf.git -b branch-23.08 +git clone --recursive https://github.com/rapidsai/cudf.git -b branch-23.10 ``` ### Build cuDF jar with devtoolset @@ -47,4 +47,4 @@ scl enable devtoolset-11 "java/ci/build-in-docker.sh" ### The output -You can find the cuDF jar in java/target/ like cudf-23.08.0-SNAPSHOT-cuda11.jar. +You can find the cuDF jar in java/target/ like cudf-23.10.0-SNAPSHOT-cuda11.jar. 
diff --git a/java/pom.xml b/java/pom.xml index 80e23a0bf54..afcc0e15a2c 100644 --- a/java/pom.xml +++ b/java/pom.xml @@ -21,7 +21,7 @@ ai.rapids cudf - 23.08.0-SNAPSHOT + 23.10.0-SNAPSHOT cudfjni diff --git a/java/src/main/native/CMakeLists.txt b/java/src/main/native/CMakeLists.txt index 6845dbaec5a..128989fe77f 100644 --- a/java/src/main/native/CMakeLists.txt +++ b/java/src/main/native/CMakeLists.txt @@ -28,7 +28,7 @@ rapids_cuda_init_architectures(CUDF_JNI) project( CUDF_JNI - VERSION 23.08.00 + VERSION 23.10.00 LANGUAGES C CXX CUDA ) diff --git a/python/cudf/CMakeLists.txt b/python/cudf/CMakeLists.txt index a6a354121d4..6f3e428d291 100644 --- a/python/cudf/CMakeLists.txt +++ b/python/cudf/CMakeLists.txt @@ -14,7 +14,7 @@ cmake_minimum_required(VERSION 3.26.4 FATAL_ERROR) -set(cudf_version 23.08.00) +set(cudf_version 23.10.00) include(../../fetch_rapids.cmake) include(rapids-cuda) diff --git a/python/cudf/cudf/__init__.py b/python/cudf/cudf/__init__.py index b05374251d4..d8cee514fb7 100644 --- a/python/cudf/cudf/__init__.py +++ b/python/cudf/cudf/__init__.py @@ -100,7 +100,7 @@ rmm.register_reinitialize_hook(clear_cache) -__version__ = "23.08.00" +__version__ = "23.10.00" __all__ = [ "BaseIndex", diff --git a/python/cudf/pyproject.toml b/python/cudf/pyproject.toml index 109a2c83b46..92883f2bff3 100644 --- a/python/cudf/pyproject.toml +++ b/python/cudf/pyproject.toml @@ -9,7 +9,7 @@ requires = [ "numpy>=1.21", "protoc-wheel", "pyarrow==11.0.0.*", - "rmm==23.8.*", + "rmm==23.10.*", "scikit-build>=0.13.1", "setuptools", "wheel", @@ -17,7 +17,7 @@ requires = [ [project] name = "cudf" -version = "23.08.00" +version = "23.10.00" description = "cuDF - GPU Dataframe" readme = { file = "README.md", content-type = "text/markdown" } authors = [ @@ -39,7 +39,7 @@ dependencies = [ "protobuf>=4.21.6,<4.22", "ptxcompiler", "pyarrow==11.*", - "rmm==23.8.*", + "rmm==23.10.*", "typing_extensions>=4.0.0", ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. classifiers = [ diff --git a/python/cudf_kafka/pyproject.toml b/python/cudf_kafka/pyproject.toml index 3fb04402ccb..d458969d40f 100644 --- a/python/cudf_kafka/pyproject.toml +++ b/python/cudf_kafka/pyproject.toml @@ -12,7 +12,7 @@ requires = [ [project] name = "cudf_kafka" -version = "23.08.00" +version = "23.10.00" description = "cuDF Kafka Datasource" readme = { file = "README.md", content-type = "text/markdown" } authors = [ @@ -21,7 +21,7 @@ authors = [ license = { text = "Apache 2.0" } requires-python = ">=3.9" dependencies = [ - "cudf==23.8.*", + "cudf==23.10.*", ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. 
[project.optional-dependencies] diff --git a/python/custreamz/pyproject.toml b/python/custreamz/pyproject.toml index c174f28b8c0..47ade91b5eb 100644 --- a/python/custreamz/pyproject.toml +++ b/python/custreamz/pyproject.toml @@ -9,7 +9,7 @@ requires = [ [project] name = "custreamz" -version = "23.08.00" +version = "23.10.00" description = "cuStreamz - GPU Accelerated Streaming" readme = { file = "README.md", content-type = "text/markdown" } authors = [ @@ -19,8 +19,8 @@ license = { text = "Apache 2.0" } requires-python = ">=3.9" dependencies = [ "confluent-kafka>=1.9.0,<1.10.0a0", - "cudf==23.8.*", - "cudf_kafka==23.8.*", + "cudf==23.10.*", + "cudf_kafka==23.10.*", "streamz", ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. classifiers = [ diff --git a/python/dask_cudf/dask_cudf/__init__.py b/python/dask_cudf/dask_cudf/__init__.py index f6c92aeb49c..6952c3d5882 100644 --- a/python/dask_cudf/dask_cudf/__init__.py +++ b/python/dask_cudf/dask_cudf/__init__.py @@ -14,7 +14,7 @@ except ImportError: pass -__version__ = "23.08.00" +__version__ = "23.10.00" __all__ = [ "DataFrame", diff --git a/python/dask_cudf/pyproject.toml b/python/dask_cudf/pyproject.toml index 977608517ed..2b43a686d54 100644 --- a/python/dask_cudf/pyproject.toml +++ b/python/dask_cudf/pyproject.toml @@ -9,7 +9,7 @@ requires = [ [project] name = "dask_cudf" -version = "23.08.00" +version = "23.10.00" description = "Utilities for Dask and cuDF interactions" readme = { file = "README.md", content-type = "text/markdown" } authors = [ @@ -18,7 +18,7 @@ authors = [ license = { text = "Apache 2.0" } requires-python = ">=3.9" dependencies = [ - "cudf==23.8.*", + "cudf==23.10.*", "cupy-cuda11x>=12.0.0", "dask>=2023.5.1", "distributed>=2023.5.1", @@ -39,7 +39,7 @@ dynamic = ["entry-points"] [project.optional-dependencies] test = [ - "dask-cuda==23.8.*", + "dask-cuda==23.10.*", "numba>=0.57", "pytest", "pytest-cov", From 2a590dbb6a06eb59bdfa97976dd5b22635b6c1f9 Mon Sep 17 00:00:00 2001 From: GALI PREM SAGAR Date: Mon, 24 Jul 2023 15:43:45 -0500 Subject: [PATCH 002/230] Enforce deprecations in `23.10` (#13732) This PR enforces previously deprecated code until `23.08` in `23.10`. This PR removes `strings_to_categorical` parameter support in `read_parquet`. 
Authors: - GALI PREM SAGAR (https://github.com/galipremsagar) Approvers: - Matthew Roeschke (https://github.com/mroeschke) - Richard (Rick) Zamora (https://github.com/rjzamora) - Bradley Dice (https://github.com/bdice) URL: https://github.com/rapidsai/cudf/pull/13732 --- python/cudf/cudf/_lib/cpp/io/parquet.pxd | 5 ---- python/cudf/cudf/_lib/parquet.pyx | 3 -- python/cudf/cudf/io/parquet.py | 10 ------- python/cudf/cudf/tests/test_parquet.py | 28 +++---------------- python/cudf/cudf/utils/ioutils.py | 8 ------ python/dask_cudf/dask_cudf/io/parquet.py | 15 ---------- .../dask_cudf/io/tests/test_parquet.py | 6 ---- 7 files changed, 4 insertions(+), 71 deletions(-) diff --git a/python/cudf/cudf/_lib/cpp/io/parquet.pxd b/python/cudf/cudf/_lib/cpp/io/parquet.pxd index e2570eaa7d9..f6fa04b9c29 100644 --- a/python/cudf/cudf/_lib/cpp/io/parquet.pxd +++ b/python/cudf/cudf/_lib/cpp/io/parquet.pxd @@ -19,14 +19,12 @@ cdef extern from "cudf/io/parquet.hpp" namespace "cudf::io" nogil: cudf_io_types.source_info get_source_info() except + vector[vector[size_type]] get_row_groups() except + data_type get_timestamp_type() except + - bool is_enabled_convert_strings_to_categories() except + bool is_enabled_use_pandas_metadata() except + # setter void set_columns(vector[string] col_names) except + void set_row_groups(vector[vector[size_type]] row_grp) except + - void enable_convert_strings_to_categories(bool val) except + void enable_use_pandas_metadata(bool val) except + void set_timestamp_type(data_type type) except + @@ -46,9 +44,6 @@ cdef extern from "cudf/io/parquet.hpp" namespace "cudf::io" nogil: parquet_reader_options_builder& row_groups( vector[vector[size_type]] row_grp ) except + - parquet_reader_options_builder& convert_strings_to_categories( - bool val - ) except + parquet_reader_options_builder& use_pandas_metadata( bool val ) except + diff --git a/python/cudf/cudf/_lib/parquet.pyx b/python/cudf/cudf/_lib/parquet.pyx index 2c7f5df084b..7c861203d6c 100644 --- a/python/cudf/cudf/_lib/parquet.pyx +++ b/python/cudf/cudf/_lib/parquet.pyx @@ -120,7 +120,6 @@ def _parse_metadata(meta): cpdef read_parquet(filepaths_or_buffers, columns=None, row_groups=None, - strings_to_categorical=False, use_pandas_metadata=True): """ Cython function to call into libcudf API, see `read_parquet`. 
@@ -144,7 +143,6 @@ cpdef read_parquet(filepaths_or_buffers, columns=None, row_groups=None, cdef cudf_io_types.source_info source = make_source_info( filepaths_or_buffers) - cdef bool cpp_strings_to_categorical = strings_to_categorical cdef bool cpp_use_pandas_metadata = use_pandas_metadata cdef vector[vector[size_type]] cpp_row_groups @@ -160,7 +158,6 @@ cpdef read_parquet(filepaths_or_buffers, columns=None, row_groups=None, args = move( parquet_reader_options.builder(source) .row_groups(cpp_row_groups) - .convert_strings_to_categories(cpp_strings_to_categorical) .use_pandas_metadata(cpp_use_pandas_metadata) .timestamp_type(cpp_timestamp_type) .build() diff --git a/python/cudf/cudf/io/parquet.py b/python/cudf/cudf/io/parquet.py index 0dec8e1c67f..d8510cf8e95 100644 --- a/python/cudf/cudf/io/parquet.py +++ b/python/cudf/cudf/io/parquet.py @@ -437,7 +437,6 @@ def read_parquet( storage_options=None, filters=None, row_groups=None, - strings_to_categorical=False, use_pandas_metadata=True, use_python_file_object=True, categorical_partitions=True, @@ -449,12 +448,6 @@ def read_parquet( ): """{docstring}""" - if strings_to_categorical is not False: - warnings.warn( - "`strings_to_categorical` is deprecated and will be removed in " - "a future version of cudf.", - FutureWarning, - ) # Do not allow the user to set file-opening options # when `use_python_file_object=False` is specified if use_python_file_object is False: @@ -578,7 +571,6 @@ def read_parquet( *args, columns=columns, row_groups=row_groups, - strings_to_categorical=strings_to_categorical, use_pandas_metadata=use_pandas_metadata, partition_keys=partition_keys, partition_categories=partition_categories, @@ -809,7 +801,6 @@ def _read_parquet( engine, columns=None, row_groups=None, - strings_to_categorical=None, use_pandas_metadata=None, *args, **kwargs, @@ -831,7 +822,6 @@ def _read_parquet( filepaths_or_buffers, columns=columns, row_groups=row_groups, - strings_to_categorical=strings_to_categorical, use_pandas_metadata=use_pandas_metadata, ) else: diff --git a/python/cudf/cudf/tests/test_parquet.py b/python/cudf/cudf/tests/test_parquet.py index cdece1397c3..f403c522f58 100644 --- a/python/cudf/cudf/tests/test_parquet.py +++ b/python/cudf/cudf/tests/test_parquet.py @@ -31,7 +31,6 @@ TIMEDELTA_TYPES, assert_eq, assert_exceptions_equal, - expect_warning_if, set_random_null_mask_inplace, ) @@ -298,8 +297,7 @@ def test_parquet_reader_empty_pandas_dataframe(tmpdir, engine): @pytest.mark.parametrize("has_null", [False, True]) -@pytest.mark.parametrize("strings_to_categorical", [False, True, None]) -def test_parquet_reader_strings(tmpdir, strings_to_categorical, has_null): +def test_parquet_reader_strings(tmpdir, has_null): df = pd.DataFrame( [(1, "aaa", 9.0), (2, "bbb", 8.0), (3, "ccc", 7.0)], columns=pd.Index(list("abc")), @@ -310,28 +308,10 @@ def test_parquet_reader_strings(tmpdir, strings_to_categorical, has_null): df.to_parquet(fname) assert os.path.exists(fname) - if strings_to_categorical is not None: - with expect_warning_if(strings_to_categorical is not False): - gdf = cudf.read_parquet( - fname, - engine="cudf", - strings_to_categorical=strings_to_categorical, - ) - else: - gdf = cudf.read_parquet(fname, engine="cudf") + gdf = cudf.read_parquet(fname, engine="cudf") - if strings_to_categorical: - if has_null: - hash_ref = [989983842, None, 1169108191] - else: - hash_ref = [989983842, 429364346, 1169108191] - assert gdf["b"].dtype == np.dtype("int32") - assert_eq( - gdf["b"], cudf.Series(hash_ref, dtype=np.dtype("int32"), name="b") - ) 
- else: - assert gdf["b"].dtype == np.dtype("object") - assert_eq(gdf["b"], df["b"]) + assert gdf["b"].dtype == np.dtype("object") + assert_eq(gdf["b"], df["b"]) @pytest.mark.parametrize("columns", [None, ["b"]]) diff --git a/python/cudf/cudf/utils/ioutils.py b/python/cudf/cudf/utils/ioutils.py index fb8492bbf4f..91925bf3c0c 100644 --- a/python/cudf/cudf/utils/ioutils.py +++ b/python/cudf/cudf/utils/ioutils.py @@ -166,14 +166,6 @@ If not None, specifies, for each input file, which row groups to read. If reading multiple inputs, a list of lists should be passed, one list for each input. -strings_to_categorical : boolean, default False - If True, return string columns as GDF_CATEGORY dtype; if False, return a - as GDF_STRING dtype. - - .. deprecated:: 23.08 - - This parameter is deprecated and will be removed in a future - version of cudf. categorical_partitions : boolean, default True Whether directory-partitioned columns should be interpreted as categorical or raw dtypes. diff --git a/python/dask_cudf/dask_cudf/io/parquet.py b/python/dask_cudf/dask_cudf/io/parquet.py index 65d9fee3a8a..dd8c3394a2c 100644 --- a/python/dask_cudf/dask_cudf/io/parquet.py +++ b/python/dask_cudf/dask_cudf/io/parquet.py @@ -50,15 +50,6 @@ def _create_dd_meta(cls, dataset_info, **kwargs): kwargs.get("schema", None), ) - # If `strings_to_categorical==True`, convert objects to int32 - strings_to_cats = kwargs.get("strings_to_categorical", False) - for col in meta_cudf._data.names: - if ( - isinstance(meta_cudf._data[col], cudf.core.column.StringColumn) - and strings_to_cats - ): - meta_cudf._data[col] = meta_cudf._data[col].astype("int32") - return meta_cudf @classmethod @@ -75,7 +66,6 @@ def _read_paths( columns=None, row_groups=None, filters=None, - strings_to_categorical=None, partitions=None, partitioning=None, partition_keys=None, @@ -124,7 +114,6 @@ def _read_paths( engine="cudf", columns=columns, row_groups=row_groups if row_groups else None, - strings_to_categorical=strings_to_categorical, dataset_kwargs=dataset_kwargs, categorical_partitions=False, **kwargs, @@ -142,7 +131,6 @@ def _read_paths( row_groups=row_groups[i] if row_groups else None, - strings_to_categorical=strings_to_categorical, dataset_kwargs=dataset_kwargs, categorical_partitions=False, **kwargs, @@ -245,7 +233,6 @@ def read_partition( pieces = [pieces] # Extract supported kwargs from `kwargs` - strings_to_cats = kwargs.get("strings_to_categorical", False) read_kwargs = kwargs.get("read", {}) read_kwargs.update(open_file_options or {}) check_file_size = read_kwargs.pop("check_file_size", None) @@ -291,7 +278,6 @@ def read_partition( columns=read_columns, row_groups=rgs if rgs else None, filters=filters, - strings_to_categorical=strings_to_cats, partitions=partitions, partitioning=partitioning, partition_keys=last_partition_keys, @@ -318,7 +304,6 @@ def read_partition( columns=read_columns, row_groups=rgs if rgs else None, filters=filters, - strings_to_categorical=strings_to_cats, partitions=partitions, partitioning=partitioning, partition_keys=last_partition_keys, diff --git a/python/dask_cudf/dask_cudf/io/tests/test_parquet.py b/python/dask_cudf/dask_cudf/io/tests/test_parquet.py index 7b9fac665c6..85ec36cf2c5 100644 --- a/python/dask_cudf/dask_cudf/io/tests/test_parquet.py +++ b/python/dask_cudf/dask_cudf/io/tests/test_parquet.py @@ -159,12 +159,6 @@ def test_strings(tmpdir): read_df = dask_cudf.read_parquet(fn, index=["a"]) dd.assert_eq(ddf2, read_df.compute().to_pandas()) - read_df_cats = dask_cudf.read_parquet( - fn, index=["a"], 
strings_to_categorical=True - ) - dd.assert_eq(read_df_cats.dtypes, read_df_cats.compute().dtypes) - dd.assert_eq(read_df_cats.dtypes[0], "int32") - def test_dask_timeseries_from_pandas(tmpdir): From 43aca00543a60e04c1409d1c5baa1560f8cf8f9a Mon Sep 17 00:00:00 2001 From: brandon-b-miller <53796099+brandon-b-miller@users.noreply.github.com> Date: Tue, 25 Jul 2023 13:28:41 -0500 Subject: [PATCH 003/230] Support more numeric types in `Groupby.apply` with `engine='jit'` (#13729) draft This PR adds additional numeric dtypes to `GroupBy.apply` with `engine='jit'`. Authors: - https://github.com/brandon-b-miller Approvers: - Bradley Dice (https://github.com/bdice) URL: https://github.com/rapidsai/cudf/pull/13729 --- python/cudf/cudf/core/udf/groupby_typing.py | 68 ++++++++++++++++----- python/cudf/cudf/tests/test_groupby.py | 6 +- python/cudf/udf_cpp/shim.cu | 22 +++++++ 3 files changed, 81 insertions(+), 15 deletions(-) diff --git a/python/cudf/cudf/core/udf/groupby_typing.py b/python/cudf/cudf/core/udf/groupby_typing.py index 37381a95fdf..bc6a084f2b4 100644 --- a/python/cudf/cudf/core/udf/groupby_typing.py +++ b/python/cudf/cudf/core/udf/groupby_typing.py @@ -17,9 +17,14 @@ index_default_type = types.int64 group_size_type = types.int64 -SUPPORTED_GROUPBY_NUMBA_TYPES = [types.int64, types.float64] +SUPPORTED_GROUPBY_NUMBA_TYPES = [ + types.int32, + types.int64, + types.float32, + types.float64, +] SUPPORTED_GROUPBY_NUMPY_TYPES = [ - numpy_support.as_dtype(dt) for dt in [types.int64, types.float64] + numpy_support.as_dtype(dt) for dt in SUPPORTED_GROUPBY_NUMBA_TYPES ] @@ -133,6 +138,25 @@ def caller(data, index, size): call_cuda_functions[funcname.lower()][type_key] = caller +def _make_unary_attr(funcname): + class GroupUnaryReductionAttrTyping(AbstractTemplate): + key = f"GroupType.{funcname}" + + def generic(self, args, kws): + for retty, inputty in call_cuda_functions[funcname.lower()].keys(): + if self.this.group_scalar_type == inputty: + return nb_signature(retty, recvr=self.this) + return None + + def _attr(self, mod): + return types.BoundFunction( + GroupUnaryReductionAttrTyping, + GroupType(mod.group_scalar_type, mod.index_type), + ) + + return _attr + + def _create_reduction_attr(name, retty=None): class Attr(AbstractTemplate): key = name @@ -171,9 +195,13 @@ def generic(self, args, kws): class GroupAttr(AttributeTemplate): key = GroupType - resolve_max = _create_reduction_attr("GroupType.max") - resolve_min = _create_reduction_attr("GroupType.min") - resolve_sum = _create_reduction_attr("GroupType.sum") + resolve_max = _make_unary_attr("max") + resolve_min = _make_unary_attr("min") + resolve_sum = _make_unary_attr("sum") + + resolve_mean = _make_unary_attr("mean") + resolve_var = _make_unary_attr("var") + resolve_std = _make_unary_attr("std") resolve_size = _create_reduction_attr( "GroupType.size", retty=group_size_type @@ -181,11 +209,6 @@ class GroupAttr(AttributeTemplate): resolve_count = _create_reduction_attr( "GroupType.count", retty=types.int64 ) - resolve_mean = _create_reduction_attr( - "GroupType.mean", retty=types.float64 - ) - resolve_var = _create_reduction_attr("GroupType.var", retty=types.float64) - resolve_std = _create_reduction_attr("GroupType.std", retty=types.float64) def resolve_idxmax(self, mod): return types.BoundFunction( @@ -201,13 +224,30 @@ def resolve_idxmin(self, mod): for ty in SUPPORTED_GROUPBY_NUMBA_TYPES: _register_cuda_reduction_caller("Max", ty, ty) _register_cuda_reduction_caller("Min", ty, ty) - _register_cuda_reduction_caller("Sum", ty, ty) - 
_register_cuda_reduction_caller("Mean", ty, types.float64) - _register_cuda_reduction_caller("Std", ty, types.float64) - _register_cuda_reduction_caller("Var", ty, types.float64) _register_cuda_idx_reduction_caller("IdxMax", ty) _register_cuda_idx_reduction_caller("IdxMin", ty) +_register_cuda_reduction_caller("Sum", types.int32, types.int64) +_register_cuda_reduction_caller("Sum", types.int64, types.int64) +_register_cuda_reduction_caller("Sum", types.float32, types.float32) +_register_cuda_reduction_caller("Sum", types.float64, types.float64) + + +_register_cuda_reduction_caller("Mean", types.int32, types.float64) +_register_cuda_reduction_caller("Mean", types.int64, types.float64) +_register_cuda_reduction_caller("Mean", types.float32, types.float32) +_register_cuda_reduction_caller("Mean", types.float64, types.float64) + +_register_cuda_reduction_caller("Std", types.int32, types.float64) +_register_cuda_reduction_caller("Std", types.int64, types.float64) +_register_cuda_reduction_caller("Std", types.float32, types.float32) +_register_cuda_reduction_caller("Std", types.float64, types.float64) + +_register_cuda_reduction_caller("Var", types.int32, types.float64) +_register_cuda_reduction_caller("Var", types.int64, types.float64) +_register_cuda_reduction_caller("Var", types.float32, types.float32) +_register_cuda_reduction_caller("Var", types.float64, types.float64) + for attr in ("group_data", "index", "size"): make_attribute_wrapper(GroupType, attr, attr) diff --git a/python/cudf/cudf/tests/test_groupby.py b/python/cudf/cudf/tests/test_groupby.py index dde80639fc7..48092be390d 100644 --- a/python/cudf/cudf/tests/test_groupby.py +++ b/python/cudf/cudf/tests/test_groupby.py @@ -402,7 +402,11 @@ def run_groupby_apply_jit_test(data, func, keys, *args): assert_groupby_results_equal(cudf_jit_result, pandas_result) -@pytest.mark.parametrize("dtype", SUPPORTED_GROUPBY_NUMPY_TYPES) +@pytest.mark.parametrize( + "dtype", + SUPPORTED_GROUPBY_NUMPY_TYPES, + ids=[str(t) for t in SUPPORTED_GROUPBY_NUMPY_TYPES], +) @pytest.mark.parametrize( "func", ["min", "max", "sum", "mean", "var", "std", "idxmin", "idxmax"] ) diff --git a/python/cudf/udf_cpp/shim.cu b/python/cudf/udf_cpp/shim.cu index 63ad1039da6..a81c8238f76 100644 --- a/python/cudf/udf_cpp/shim.cu +++ b/python/cudf/udf_cpp/shim.cu @@ -630,17 +630,34 @@ extern "C" { return 0; \ } +make_definition(BlockSum, int32, int32_t, int64_t); make_definition(BlockSum, int64, int64_t, int64_t); +make_definition(BlockSum, float32, float, float); make_definition(BlockSum, float64, double, double); + +make_definition(BlockMean, int32, int32_t, double); make_definition(BlockMean, int64, int64_t, double); +make_definition(BlockMean, float32, float, float); make_definition(BlockMean, float64, double, double); + +make_definition(BlockStd, int32, int32_t, double); make_definition(BlockStd, int64, int64_t, double); +make_definition(BlockStd, float32, float, float); make_definition(BlockStd, float64, double, double); + make_definition(BlockVar, int64, int64_t, double); +make_definition(BlockVar, int32, int32_t, double); +make_definition(BlockVar, float32, float, float); make_definition(BlockVar, float64, double, double); + +make_definition(BlockMin, int32, int32_t, int32_t); make_definition(BlockMin, int64, int64_t, int64_t); +make_definition(BlockMin, float32, float, float); make_definition(BlockMin, float64, double, double); + +make_definition(BlockMax, int32, int32_t, int32_t); make_definition(BlockMax, int64, int64_t, int64_t); +make_definition(BlockMax, float32, 
float, float); make_definition(BlockMax, float64, double, double); #undef make_definition } @@ -656,9 +673,14 @@ extern "C" { return 0; \ } +make_definition_idx(BlockIdxMin, int32, int32_t); make_definition_idx(BlockIdxMin, int64, int64_t); +make_definition_idx(BlockIdxMin, float32, float); make_definition_idx(BlockIdxMin, float64, double); + +make_definition_idx(BlockIdxMax, int32, int32_t); make_definition_idx(BlockIdxMax, int64, int64_t); +make_definition_idx(BlockIdxMax, float32, float); make_definition_idx(BlockIdxMax, float64, double); #undef make_definition_idx } From 427f8792e04662afeccad3beaae593817c52079f Mon Sep 17 00:00:00 2001 From: Lawrence Mitchell Date: Wed, 26 Jul 2023 13:28:11 +0100 Subject: [PATCH 004/230] Provide our own Cython declaration for make_unique (#13746) `make_unique` in Cython's libcpp headers is not annotated with `except +`. As a consequence, if the constructor throws, we do not catch it in Python. To work around this (see cython/cython#5560 for details), provide our own implementation. Due to the way assignments occur to temporaries, we need to now explicitly wrap all calls to `make_unique` in `move`, but that is arguably preferable to not being able to catch exceptions, and will not be necessary once we move to Cython 3. - Closes #13743 Authors: - Lawrence Mitchell (https://github.com/wence-) Approvers: - Ashwin Srinath (https://github.com/shwina) URL: https://github.com/rapidsai/cudf/pull/13746 --- python/cudf/cudf/_lib/column.pyx | 5 ++-- python/cudf/cudf/_lib/concat.pyx | 7 ++--- python/cudf/cudf/_lib/copying.pyx | 4 +-- python/cudf/cudf/_lib/cpp/libcpp/memory.pxd | 12 +++++++++ python/cudf/cudf/_lib/expressions.pyx | 28 ++++++++++--------- python/cudf/cudf/_lib/join.pyx | 7 ++--- python/cudf/cudf/_lib/null_mask.pyx | 13 ++++----- python/cudf/cudf/_lib/parquet.pyx | 5 ++-- python/cudf_kafka/cudf_kafka/_lib/kafka.pyx | 30 +++++++++++---------- 9 files changed, 66 insertions(+), 45 deletions(-) create mode 100644 python/cudf/cudf/_lib/cpp/libcpp/memory.pxd diff --git a/python/cudf/cudf/_lib/column.pyx b/python/cudf/cudf/_lib/column.pyx index 587ac8e7a30..2b1fc14f398 100644 --- a/python/cudf/cudf/_lib/column.pyx +++ b/python/cudf/cudf/_lib/column.pyx @@ -27,7 +27,7 @@ from cudf.utils.dtypes import _get_base_dtype from cpython.buffer cimport PyObject_CheckBuffer from libc.stdint cimport uintptr_t -from libcpp.memory cimport make_unique, unique_ptr +from libcpp.memory cimport unique_ptr from libcpp.utility cimport move from libcpp.vector cimport vector @@ -49,6 +49,7 @@ from cudf._lib.cpp.column.column_factories cimport ( make_numeric_column, ) from cudf._lib.cpp.column.column_view cimport column_view +from cudf._lib.cpp.libcpp.memory cimport make_unique from cudf._lib.cpp.null_mask cimport null_count as cpp_null_count from cudf._lib.cpp.scalar.scalar cimport scalar from cudf._lib.scalar cimport DeviceScalar @@ -275,7 +276,7 @@ cdef class Column: self._children = () else: children = Column.from_unique_ptr( - make_unique[column](self.view()) + move(make_unique[column](self.view())) ).base_children dtypes = [ base_child.dtype for base_child in self.base_children diff --git a/python/cudf/cudf/_lib/concat.pyx b/python/cudf/cudf/_lib/concat.pyx index 7872375f599..feaf75ef237 100644 --- a/python/cudf/cudf/_lib/concat.pyx +++ b/python/cudf/cudf/_lib/concat.pyx @@ -1,7 +1,7 @@ -# Copyright (c) 2020-2022, NVIDIA CORPORATION. +# Copyright (c) 2020-2023, NVIDIA CORPORATION. 
from libcpp cimport bool -from libcpp.memory cimport make_unique, unique_ptr +from libcpp.memory cimport unique_ptr from libcpp.utility cimport move from libcpp.vector cimport vector @@ -12,6 +12,7 @@ from cudf._lib.cpp.concatenate cimport ( concatenate_masks as libcudf_concatenate_masks, concatenate_tables as libcudf_concatenate_tables, ) +from cudf._lib.cpp.libcpp.memory cimport make_unique from cudf._lib.cpp.table.table cimport table, table_view from cudf._lib.utils cimport ( data_from_unique_ptr, @@ -30,7 +31,7 @@ cpdef concat_masks(object columns): cdef vector[column_view] c_views = make_column_views(columns) with nogil: c_result = move(libcudf_concatenate_masks(c_views)) - c_unique_result = make_unique[device_buffer](move(c_result)) + c_unique_result = move(make_unique[device_buffer](move(c_result))) return as_buffer( DeviceBuffer.c_from_unique_ptr(move(c_unique_result)) ) diff --git a/python/cudf/cudf/_lib/copying.pyx b/python/cudf/cudf/_lib/copying.pyx index 70f27e16a3a..944a80158df 100644 --- a/python/cudf/cudf/_lib/copying.pyx +++ b/python/cudf/cudf/_lib/copying.pyx @@ -4,7 +4,7 @@ import pickle from libc.stdint cimport int32_t, uint8_t, uintptr_t from libcpp cimport bool -from libcpp.memory cimport make_shared, make_unique, shared_ptr, unique_ptr +from libcpp.memory cimport make_shared, shared_ptr, unique_ptr from libcpp.utility cimport move from libcpp.vector cimport vector @@ -29,6 +29,7 @@ cimport cudf._lib.cpp.copying as cpp_copying from cudf._lib.cpp.column.column cimport column from cudf._lib.cpp.column.column_view cimport column_view, mutable_column_view from cudf._lib.cpp.libcpp.functional cimport reference_wrapper +from cudf._lib.cpp.libcpp.memory cimport make_unique from cudf._lib.cpp.lists.gather cimport ( segmented_gather as cpp_segmented_gather, ) @@ -80,7 +81,6 @@ def copy_column(Column input_column): Deep copied column """ cdef unique_ptr[column] c_result - cdef column_view input_column_view = input_column.view() with nogil: c_result = move(make_unique[column](input_column_view)) diff --git a/python/cudf/cudf/_lib/cpp/libcpp/memory.pxd b/python/cudf/cudf/_lib/cpp/libcpp/memory.pxd new file mode 100644 index 00000000000..2178f1a940c --- /dev/null +++ b/python/cudf/cudf/_lib/cpp/libcpp/memory.pxd @@ -0,0 +1,12 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. + +from libcpp.memory cimport unique_ptr + + +cdef extern from "" namespace "std" nogil: + # The Cython standard header does not have except +, so C++ + # exceptions from make_unique are not caught and translated to + # Python ones. This is not perfectly ergonomic, we always have to + # wrap make_unique in move, but at least we can catch exceptions. + # See https://github.com/cython/cython/issues/5560 + unique_ptr[T] make_unique[T](...) except + diff --git a/python/cudf/cudf/_lib/expressions.pyx b/python/cudf/cudf/_lib/expressions.pyx index 64a77f25385..b02446a7c7c 100644 --- a/python/cudf/cudf/_lib/expressions.pyx +++ b/python/cudf/cudf/_lib/expressions.pyx @@ -4,9 +4,11 @@ from enum import Enum from cython.operator cimport dereference from libc.stdint cimport int64_t -from libcpp.memory cimport make_unique, unique_ptr +from libcpp.memory cimport unique_ptr +from libcpp.utility cimport move from cudf._lib.cpp cimport expressions as libcudf_exp +from cudf._lib.cpp.libcpp.memory cimport make_unique from cudf._lib.cpp.types cimport size_type # Necessary for proper casting, see below. 
@@ -80,26 +82,26 @@ cdef class Literal(Expression): def __cinit__(self, value): if isinstance(value, int): self.c_scalar.reset(new numeric_scalar[int64_t](value, True)) - self.c_obj = make_unique[libcudf_exp.literal]( + self.c_obj = move(make_unique[libcudf_exp.literal]( dereference(self.c_scalar) - ) + )) elif isinstance(value, float): self.c_scalar.reset(new numeric_scalar[double](value, True)) - self.c_obj = make_unique[libcudf_exp.literal]( + self.c_obj = move(make_unique[libcudf_exp.literal]( dereference(self.c_scalar) - ) + )) elif isinstance(value, str): self.c_scalar.reset(new string_scalar(value.encode(), True)) - self.c_obj = make_unique[libcudf_exp.literal]( + self.c_obj = move(make_unique[libcudf_exp.literal]( dereference(self.c_scalar) - ) + )) cdef class ColumnReference(Expression): def __cinit__(self, size_type index): - self.c_obj = make_unique[libcudf_exp.column_reference]( + self.c_obj = move(make_unique[libcudf_exp.column_reference]( index - ) + )) cdef class Operation(Expression): @@ -109,10 +111,10 @@ cdef class Operation(Expression): ) if right is None: - self.c_obj = make_unique[libcudf_exp.operation]( + self.c_obj = move(make_unique[libcudf_exp.operation]( op_value, dereference(left.c_obj) - ) + )) else: - self.c_obj = make_unique[libcudf_exp.operation]( + self.c_obj = move(make_unique[libcudf_exp.operation]( op_value, dereference(left.c_obj), dereference(right.c_obj) - ) + )) diff --git a/python/cudf/cudf/_lib/join.pyx b/python/cudf/cudf/_lib/join.pyx index 70667c639bb..416680aae24 100644 --- a/python/cudf/cudf/_lib/join.pyx +++ b/python/cudf/cudf/_lib/join.pyx @@ -2,7 +2,7 @@ from cudf.core.buffer import acquire_spill_lock -from libcpp.memory cimport make_unique, unique_ptr +from libcpp.memory cimport unique_ptr from libcpp.pair cimport pair from libcpp.utility cimport move @@ -11,6 +11,7 @@ from rmm._lib.device_buffer cimport device_buffer cimport cudf._lib.cpp.join as cpp_join from cudf._lib.column cimport Column from cudf._lib.cpp.column.column cimport column +from cudf._lib.cpp.libcpp.memory cimport make_unique from cudf._lib.cpp.table.table_view cimport table_view from cudf._lib.cpp.types cimport data_type, size_type, type_id from cudf._lib.utils cimport table_view_from_columns @@ -66,8 +67,8 @@ cdef Column _gather_map_as_column(cpp_join.gather_map_type gather_map): # help to convert a gather map to a Column cdef device_buffer c_empty cdef size_type size = gather_map.get()[0].size() - cdef unique_ptr[column] c_col = make_unique[column]( + cdef unique_ptr[column] c_col = move(make_unique[column]( data_type(type_id.INT32), size, - gather_map.get()[0].release(), move(c_empty), 0) + gather_map.get()[0].release(), move(c_empty), 0)) return Column.from_unique_ptr(move(c_col)) diff --git a/python/cudf/cudf/_lib/null_mask.pyx b/python/cudf/cudf/_lib/null_mask.pyx index c41ae98b9bd..5b4538629f6 100644 --- a/python/cudf/cudf/_lib/null_mask.pyx +++ b/python/cudf/cudf/_lib/null_mask.pyx @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2022, NVIDIA CORPORATION. +# Copyright (c) 2020-2023, NVIDIA CORPORATION. 
from enum import Enum @@ -6,12 +6,13 @@ from rmm._lib.device_buffer cimport DeviceBuffer, device_buffer from cudf.core.buffer import acquire_spill_lock, as_buffer -from libcpp.memory cimport make_unique, unique_ptr +from libcpp.memory cimport unique_ptr from libcpp.pair cimport pair from libcpp.utility cimport move from cudf._lib.column cimport Column from cudf._lib.cpp.column.column_view cimport column_view +from cudf._lib.cpp.libcpp.memory cimport make_unique from cudf._lib.cpp.null_mask cimport ( bitmask_allocation_size_bytes as cpp_bitmask_allocation_size_bytes, bitmask_and as cpp_bitmask_and, @@ -50,7 +51,7 @@ def copy_bitmask(Column col): with nogil: db = move(cpp_copy_bitmask(col_view)) - up_db = make_unique[device_buffer](move(db)) + up_db = move(make_unique[device_buffer](move(db))) rmm_db = DeviceBuffer.c_from_unique_ptr(move(up_db)) buf = as_buffer(rmm_db) @@ -96,7 +97,7 @@ def create_null_mask(size_type size, state=MaskState.UNINITIALIZED): with nogil: db = move(cpp_create_null_mask(size, c_mask_state)) - up_db = make_unique[device_buffer](move(db)) + up_db = move(make_unique[device_buffer](move(db))) rmm_db = DeviceBuffer.c_from_unique_ptr(move(up_db)) buf = as_buffer(rmm_db) @@ -110,7 +111,7 @@ def bitmask_and(columns: list): cdef unique_ptr[device_buffer] up_db with nogil: c_result = move(cpp_bitmask_and(c_view)) - up_db = make_unique[device_buffer](move(c_result.first)) + up_db = move(make_unique[device_buffer](move(c_result.first))) dbuf = DeviceBuffer.c_from_unique_ptr(move(up_db)) buf = as_buffer(dbuf) return buf, c_result.second @@ -123,7 +124,7 @@ def bitmask_or(columns: list): cdef unique_ptr[device_buffer] up_db with nogil: c_result = move(cpp_bitmask_or(c_view)) - up_db = make_unique[device_buffer](move(c_result.first)) + up_db = move(make_unique[device_buffer](move(c_result.first))) dbuf = DeviceBuffer.c_from_unique_ptr(move(up_db)) buf = as_buffer(dbuf) return buf, c_result.second diff --git a/python/cudf/cudf/_lib/parquet.pyx b/python/cudf/cudf/_lib/parquet.pyx index 7c861203d6c..d297c80ab5a 100644 --- a/python/cudf/cudf/_lib/parquet.pyx +++ b/python/cudf/cudf/_lib/parquet.pyx @@ -32,7 +32,7 @@ from cudf._lib.utils import _index_level_name, generate_pandas_metadata from libc.stdint cimport uint8_t from libcpp cimport bool from libcpp.map cimport map -from libcpp.memory cimport make_unique, unique_ptr +from libcpp.memory cimport unique_ptr from libcpp.string cimport string from libcpp.unordered_map cimport unordered_map from libcpp.utility cimport move @@ -51,6 +51,7 @@ from cudf._lib.cpp.io.parquet cimport ( write_parquet as parquet_writer, ) from cudf._lib.cpp.io.types cimport column_in_metadata, table_input_metadata +from cudf._lib.cpp.libcpp.memory cimport make_unique from cudf._lib.cpp.table.table_view cimport table_view from cudf._lib.cpp.types cimport data_type, size_type from cudf._lib.io.datasource cimport NativeFileDatasource @@ -641,7 +642,7 @@ cpdef merge_filemetadata(object filemetadata_list): for blob_py in filemetadata_list: blob_c = blob_py - list_c.push_back(make_unique[vector[uint8_t]](blob_c)) + list_c.push_back(move(make_unique[vector[uint8_t]](blob_c))) with nogil: output_c = move(parquet_merge_metadata(list_c)) diff --git a/python/cudf_kafka/cudf_kafka/_lib/kafka.pyx b/python/cudf_kafka/cudf_kafka/_lib/kafka.pyx index bff60e63fdb..52278188281 100644 --- a/python/cudf_kafka/cudf_kafka/_lib/kafka.pyx +++ b/python/cudf_kafka/cudf_kafka/_lib/kafka.pyx @@ -1,12 +1,14 @@ -# Copyright (c) 2020-2022, NVIDIA CORPORATION. 
+# Copyright (c) 2020-2023, NVIDIA CORPORATION. from libc.stdint cimport int32_t, int64_t from libcpp cimport bool, nullptr from libcpp.map cimport map -from libcpp.memory cimport make_unique, unique_ptr +from libcpp.memory cimport unique_ptr from libcpp.string cimport string +from libcpp.utility cimport move from cudf._lib.cpp.io.types cimport datasource +from cudf._lib.cpp.libcpp.memory cimport make_unique from cudf_kafka._lib.kafka cimport kafka_consumer @@ -50,20 +52,20 @@ cdef class KafkaDatasource(Datasource): if topic != b"" and partition != -1: self.c_datasource = \ - make_unique[kafka_consumer](configs, - python_callable, - python_callable_wrapper, - topic, - partition, - start_offset, - end_offset, - batch_timeout, - delimiter) + move(make_unique[kafka_consumer](configs, + python_callable, + python_callable_wrapper, + topic, + partition, + start_offset, + end_offset, + batch_timeout, + delimiter)) else: self.c_datasource = \ - make_unique[kafka_consumer](configs, - python_callable, - python_callable_wrapper) + move(make_unique[kafka_consumer](configs, + python_callable, + python_callable_wrapper)) cdef datasource* get_datasource(self) nogil: return self.c_datasource.get() From 80641708d7dff778263d738445ddca07d2bce19e Mon Sep 17 00:00:00 2001 From: GALI PREM SAGAR Date: Fri, 28 Jul 2023 06:10:24 -0500 Subject: [PATCH 005/230] Fix contains(`in`) method for `Series` (#13779) Checking for boolean values in a range results in incorrect behavior: ```python In [1]: True in range(0, 0) Out[1]: False In [3]: True in range(0, 2) Out[3]: True ``` This results in the following bug: ```python In [23]: s = cudf.Series([True, False]) In [24]: s[0] Out[24]: True In [25]: type(s[0]) Out[25]: numpy.bool_ In [26]: True in s Out[26]: True In [26]: True in s.to_pandas() Out[26]: False ``` This PR fixes this issue by properly checking if an integer is passed to the `RangeIndex. 
__contains__` Authors: - GALI PREM SAGAR (https://github.com/galipremsagar) Approvers: - Bradley Dice (https://github.com/bdice) - Matthew Roeschke (https://github.com/mroeschke) URL: https://github.com/rapidsai/cudf/pull/13779 --- python/cudf/cudf/core/index.py | 4 ++++ python/cudf/cudf/tests/test_series.py | 12 ++++++++++++ 2 files changed, 16 insertions(+) diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py index d9dffa97ae0..6de6d770c17 100644 --- a/python/cudf/cudf/core/index.py +++ b/python/cudf/cudf/core/index.py @@ -311,6 +311,10 @@ def __contains__(self, item): item, tuple(np.sctypes["int"] + np.sctypes["float"] + [int, float]) ): return False + try: + item = pd.core.dtypes.common.ensure_python_int(item) + except TypeError: + return False if not item % 1 == 0: return False return item in range(self._start, self._stop, self._step) diff --git a/python/cudf/cudf/tests/test_series.py b/python/cudf/cudf/tests/test_series.py index af1e11f0e11..8f79737725f 100644 --- a/python/cudf/cudf/tests/test_series.py +++ b/python/cudf/cudf/tests/test_series.py @@ -2185,3 +2185,15 @@ def test_series_init_error(): lfunc_args_and_kwargs=([], {"data": [11], "index": [10, 11]}), rfunc_args_and_kwargs=([], {"data": [11], "index": [10, 11]}), ) + + +@pytest.mark.parametrize("data", [[True, False, None], [10, 200, 300]]) +@pytest.mark.parametrize("index", [None, [10, 20, 30]]) +def test_series_contains(data, index): + ps = pd.Series(data, index=index) + gs = cudf.from_pandas(ps) + + assert_eq(1 in ps, 1 in gs) + assert_eq(10 in ps, 10 in gs) + assert_eq(True in ps, True in gs) + assert_eq(False in ps, False in gs) From 09100469b2493e877f43f73071439a9e7b7e6adb Mon Sep 17 00:00:00 2001 From: GALI PREM SAGAR Date: Fri, 28 Jul 2023 11:36:34 -0500 Subject: [PATCH 006/230] Raise error on constructing an array from mixed type inputs (#13768) We currently are allowing construction of mixed-dtype by type-casting them into a common type as below: ```python In [1]: import cudf In [2]: import pandas as pd In [3]: s = pd.Series([1, 2, 3], dtype='datetime64[ns]') In [5]: p = pd.Series([10, 11]) In [6]: new_s = pd.concat([s, p]) In [7]: new_s Out[7]: 0 1970-01-01 00:00:00.000000001 1 1970-01-01 00:00:00.000000002 2 1970-01-01 00:00:00.000000003 0 10 1 11 dtype: object In [8]: cudf.Series(new_s) Out[8]: 0 1970-01-01 00:00:00.000000 1 1970-01-01 00:00:00.000000 2 1970-01-01 00:00:00.000000 0 1970-01-01 00:00:00.000010 1 1970-01-01 00:00:00.000011 dtype: datetime64[us] ``` This behavior is incorrect and we are getting this from `pa.array` constructor. This PR ensures we do proper handling around such cases and raise an error. 
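For reference, a minimal sketch of the intended post-fix behavior, based on the `test_series_mixed_dtype_error` test added in this change (assumes a GPU-enabled cudf build):

```python
# Sketch only: a pandas object column mixing datetimes with integers is
# rejected with TypeError instead of being silently cast to a common type.
import pandas as pd
import cudf

mixed = pd.concat(
    [pd.Series([1, 2, 3], dtype="datetime64[ns]"), pd.Series([10, 11])]
)  # object dtype, mixed Timestamp/int values
try:
    cudf.Series(mixed)
except TypeError as err:
    print(f"rejected mixed-type input: {err}")
```
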
Authors: - GALI PREM SAGAR (https://github.com/galipremsagar) Approvers: - Matthew Roeschke (https://github.com/mroeschke) - Bradley Dice (https://github.com/bdice) URL: https://github.com/rapidsai/cudf/pull/13768 --- python/cudf/cudf/core/column/column.py | 33 +++++++++++++++++++------- python/cudf/cudf/errors.py | 6 ++++- python/cudf/cudf/tests/test_index.py | 23 ++++++++++++++++++ python/cudf/cudf/tests/test_series.py | 7 ++++++ 4 files changed, 60 insertions(+), 9 deletions(-) diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index 89c5cbe2f5d..2f7cc4ba176 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -81,6 +81,7 @@ ) from cudf.core.missing import NA from cudf.core.mixins import BinaryOperand, Reducible +from cudf.errors import MixedTypeError from cudf.utils.dtypes import ( _maybe_convert_to_default_type, cudf_dtype_from_pa_type, @@ -2027,6 +2028,10 @@ def as_column( ) else: pyarrow_array = pa.array(arbitrary, from_pandas=nan_as_null) + if arbitrary.dtype == cudf.dtype("object") and isinstance( + pyarrow_array, (pa.DurationArray, pa.TimestampArray) + ): + raise TypeError("Cannot create column with mixed types") if isinstance(pyarrow_array.type, pa.Decimal128Type): pyarrow_type = cudf.Decimal128Dtype.from_arrow( pyarrow_array.type @@ -2335,18 +2340,30 @@ def as_column( _maybe_convert_to_default_type("float") ) + pyarrow_array = pa.array( + arbitrary, + type=pa_type, + from_pandas=True if nan_as_null is None else nan_as_null, + ) + + if ( + isinstance(arbitrary, pd.Index) + and arbitrary.dtype == cudf.dtype("object") + and isinstance( + pyarrow_array, (pa.DurationArray, pa.TimestampArray) + ) + ): + raise MixedTypeError( + "Cannot create column with mixed types" + ) data = as_column( - pa.array( - arbitrary, - type=pa_type, - from_pandas=True - if nan_as_null is None - else nan_as_null, - ), + pyarrow_array, dtype=dtype, nan_as_null=nan_as_null, ) - except (pa.ArrowInvalid, pa.ArrowTypeError, TypeError): + except (pa.ArrowInvalid, pa.ArrowTypeError, TypeError) as e: + if isinstance(e, MixedTypeError): + raise TypeError(str(e)) if is_categorical_dtype(dtype): sr = pd.Series(arbitrary, dtype="category") data = as_column(sr, nan_as_null=nan_as_null, dtype=dtype) diff --git a/python/cudf/cudf/errors.py b/python/cudf/cudf/errors.py index bd264940081..973e5b990b2 100644 --- a/python/cudf/cudf/errors.py +++ b/python/cudf/cudf/errors.py @@ -1,5 +1,9 @@ -# Copyright (c) 2020-2022, NVIDIA CORPORATION. +# Copyright (c) 2020-2023, NVIDIA CORPORATION. 
class UnsupportedCUDAError(Exception): pass + + +class MixedTypeError(TypeError): + pass diff --git a/python/cudf/cudf/tests/test_index.py b/python/cudf/cudf/tests/test_index.py index 03b4e76871c..62b58fc3d1a 100644 --- a/python/cudf/cudf/tests/test_index.py +++ b/python/cudf/cudf/tests/test_index.py @@ -2897,3 +2897,26 @@ def test_scalar_getitem(self, index_values, i): assert not isinstance(index[i], cudf.Index) assert index[i] == index_values[i] assert_eq(index, index.to_pandas()) + + +@pytest.mark.parametrize( + "data", + [ + [ + pd.Timestamp("1970-01-01 00:00:00.000000001"), + pd.Timestamp("1970-01-01 00:00:00.000000002"), + 12, + 20, + ], + [ + pd.Timedelta(10), + pd.Timedelta(20), + 12, + 20, + ], + ], +) +def test_index_mixed_dtype_error(data): + pi = pd.Index(data) + with pytest.raises(TypeError): + cudf.Index(pi) diff --git a/python/cudf/cudf/tests/test_series.py b/python/cudf/cudf/tests/test_series.py index 8f79737725f..8be1f431ab3 100644 --- a/python/cudf/cudf/tests/test_series.py +++ b/python/cudf/cudf/tests/test_series.py @@ -2187,6 +2187,13 @@ def test_series_init_error(): ) +@pytest.mark.parametrize("dtype", ["datetime64[ns]", "timedelta64[ns]"]) +def test_series_mixed_dtype_error(dtype): + ps = pd.concat([pd.Series([1, 2, 3], dtype=dtype), pd.Series([10, 11])]) + with pytest.raises(TypeError): + cudf.Series(ps) + + @pytest.mark.parametrize("data", [[True, False, None], [10, 200, 300]]) @pytest.mark.parametrize("index", [None, [10, 20, 30]]) def test_series_contains(data, index): From 3dba6ea3aa8fea5179c1afda9a4b642c8dacbac2 Mon Sep 17 00:00:00 2001 From: GALI PREM SAGAR Date: Fri, 28 Jul 2023 11:37:11 -0500 Subject: [PATCH 007/230] Fix negative unary operation for boolean type (#13780) Negative unary op on boolean series is resulting in conversion to `int` type: ```python In [1]: import cudf In [2]: s = cudf.Series([True, False]) In [3]: s Out[3]: 0 True 1 False dtype: bool In [4]: -s Out[4]: 0 -1 1 0 dtype: int8 In [5]: -s.to_pandas() Out[5]: 0 False 1 True dtype: bool ``` The PR fixes the above issue by returning inversion of the boolean column instead of multiplying with `-1`. 
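For illustration, a short sketch of the semantics this fix targets, mirroring the new `test_series_bool_neg` test (assumes a GPU-enabled cudf build):

```python
# Sketch only: unary minus on a boolean Series should act as logical NOT
# (matching pandas) rather than casting the result to int8.
import cudf

s = cudf.Series([True, False, True])
print(-s)              # expected: [False, True, False], dtype bool
print(-s.to_pandas())  # pandas reference for comparison
```
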
Authors: - GALI PREM SAGAR (https://github.com/galipremsagar) Approvers: - Bradley Dice (https://github.com/bdice) - Lawrence Mitchell (https://github.com/wence-) URL: https://github.com/rapidsai/cudf/pull/13780 --- python/cudf/cudf/core/frame.py | 43 +++++++++++++++++++++++++-- python/cudf/cudf/tests/test_unaops.py | 6 ++++ 2 files changed, 46 insertions(+), 3 deletions(-) diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index dcf44344cdc..85f83953465 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -28,7 +28,7 @@ import cudf from cudf import _lib as libcudf from cudf._typing import Dtype -from cudf.api.types import is_dtype_equal, is_scalar +from cudf.api.types import is_bool_dtype, is_dtype_equal, is_scalar from cudf.core.buffer import acquire_spill_lock from cudf.core.column import ( ColumnBase, @@ -97,6 +97,7 @@ def _dtypes(self): def _has_nulls(self): return any(col.has_nulls() for col in self._data.values()) + @_cudf_nvtx_annotate def serialize(self): header = { "type-serialized": pickle.dumps(type(self)), @@ -106,6 +107,7 @@ def serialize(self): return header, frames @classmethod + @_cudf_nvtx_annotate def deserialize(cls, header, frames): cls_deserialize = pickle.loads(header["type-serialized"]) column_names = pickle.loads(header["column_names"]) @@ -151,6 +153,7 @@ def _from_columns_like_self( frame = self.__class__._from_columns(columns, column_names) return frame._copy_type_metadata(self, override_dtypes=override_dtypes) + @_cudf_nvtx_annotate def _mimic_inplace( self, result: Self, inplace: bool = False ) -> Optional[Self]: @@ -166,6 +169,7 @@ def _mimic_inplace( return result @property + @_cudf_nvtx_annotate def size(self): """ Return the number of elements in the underlying data. @@ -243,11 +247,13 @@ def size(self): return self._num_columns * self._num_rows @property + @_cudf_nvtx_annotate def shape(self): """Returns a tuple representing the dimensionality of the DataFrame.""" return self._num_rows, self._num_columns @property + @_cudf_nvtx_annotate def empty(self): """ Indicator whether DataFrame or Series is empty. @@ -308,6 +314,7 @@ def empty(self): """ return self.size == 0 + @_cudf_nvtx_annotate def memory_usage(self, deep=False): """Return the memory usage of an object. @@ -323,6 +330,7 @@ def memory_usage(self, deep=False): """ raise NotImplementedError + @_cudf_nvtx_annotate def __len__(self): return self._num_rows @@ -425,6 +433,7 @@ def _get_columns_by_label(self, labels, downcast=False): return self._data.select_by_label(labels) @property + @_cudf_nvtx_annotate def values(self): """ Return a CuPy representation of the DataFrame. @@ -440,6 +449,7 @@ def values(self): return self.to_cupy() @property + @_cudf_nvtx_annotate def values_host(self): """ Return a NumPy representation of the data. @@ -454,6 +464,7 @@ def values_host(self): """ return self.to_numpy() + @_cudf_nvtx_annotate def __array__(self, dtype=None): raise TypeError( "Implicit conversion to a host NumPy array via __array__ is not " @@ -462,12 +473,14 @@ def __array__(self, dtype=None): "using .to_numpy()." ) + @_cudf_nvtx_annotate def __arrow_array__(self, type=None): raise TypeError( "Implicit conversion to a host PyArrow object via __arrow_array__ " "is not allowed. 
Consider using .to_arrow()" ) + @_cudf_nvtx_annotate def _to_array( self, get_column_values: Callable, @@ -1183,6 +1196,7 @@ def to_arrow(self): {str(name): col.to_arrow() for name, col in self._data.items()} ) + @_cudf_nvtx_annotate def _positions_from_column_names(self, column_names): """Map each column name into their positions in the frame. @@ -1195,6 +1209,7 @@ def _positions_from_column_names(self, column_names): if name in set(column_names) ] + @_cudf_nvtx_annotate def _copy_type_metadata( self, other: Self, @@ -1566,6 +1581,7 @@ def argsort( by=by, ascending=ascending, na_position=na_position ).values + @_cudf_nvtx_annotate def _get_sorted_inds(self, by=None, ascending=True, na_position="last"): """ Get the indices required to sort self according to the columns @@ -1768,9 +1784,11 @@ def _colwise_binop( return output + @_cudf_nvtx_annotate def __array_ufunc__(self, ufunc, method, *inputs, **kwargs): return _array_ufunc(self, ufunc, method, inputs, kwargs) + @_cudf_nvtx_annotate @acquire_spill_lock() def _apply_cupy_ufunc_to_operands( self, ufunc, cupy_func, operands, **kwargs @@ -1884,30 +1902,45 @@ def dot(self, other, reflect=False): return cudf.DataFrame(result) return result.item() + @_cudf_nvtx_annotate def __matmul__(self, other): return self.dot(other) + @_cudf_nvtx_annotate def __rmatmul__(self, other): return self.dot(other, reflect=True) # Unary logical operators + @_cudf_nvtx_annotate def __neg__(self): - return -1 * self + """Negate for integral dtypes, logical NOT for bools.""" + return self._from_data_like_self( + { + name: col.unary_operator("not") + if is_bool_dtype(col.dtype) + else -1 * col + for name, col in self._data.items() + } + ) + @_cudf_nvtx_annotate def __pos__(self): return self.copy(deep=True) + @_cudf_nvtx_annotate def __abs__(self): return self._unaryop("abs") # Reductions @classmethod + @_cudf_nvtx_annotate def _get_axis_from_axis_arg(cls, axis): try: return cls._SUPPORT_AXIS_LOOKUP[axis] except KeyError: raise ValueError(f"No axis named {axis} for object type {cls}") + @_cudf_nvtx_annotate def _reduce(self, *args, **kwargs): raise NotImplementedError( f"Reductions are not supported for objects of type {type(self)}." 
@@ -2577,6 +2610,7 @@ def to_string(self): """ return repr(self) + @_cudf_nvtx_annotate def __str__(self): return self.to_string() @@ -2780,6 +2814,7 @@ def __invert__(self): } ) + @_cudf_nvtx_annotate def nunique(self, dropna: bool = True): """ Returns a per column mapping with counts of unique values for @@ -2801,6 +2836,7 @@ def nunique(self, dropna: bool = True): } @staticmethod + @_cudf_nvtx_annotate def _repeat( columns: List[ColumnBase], repeats, axis=None ) -> List[ColumnBase]: @@ -2814,6 +2850,7 @@ def _repeat( return libcudf.filling.repeat(columns, repeats) + @_cudf_nvtx_annotate @_warn_no_dask_cudf def __dask_tokenize__(self): return [ @@ -2827,7 +2864,7 @@ def _apply_inverse_column(col: ColumnBase) -> ColumnBase: """Bitwise invert (~) for integral dtypes, logical NOT for bools.""" if np.issubdtype(col.dtype, np.integer): return col.unary_operator("invert") - elif np.issubdtype(col.dtype, np.bool_): + elif is_bool_dtype(col.dtype): return col.unary_operator("not") else: raise TypeError( diff --git a/python/cudf/cudf/tests/test_unaops.py b/python/cudf/cudf/tests/test_unaops.py index a7502051a78..15d9d03d4a7 100644 --- a/python/cudf/cudf/tests/test_unaops.py +++ b/python/cudf/cudf/tests/test_unaops.py @@ -123,3 +123,9 @@ def test_scalar_no_negative_bools(): ), ): -x + + +def test_series_bool_neg(): + sr = Series([True, False, True, None, False, None, True, True]) + psr = sr.to_pandas(nullable=True) + utils.assert_eq((-sr).to_pandas(nullable=True), -psr, check_dtype=True) From f00e92220a337ad3af8c01c8c9e96f3c80e4f47e Mon Sep 17 00:00:00 2001 From: GALI PREM SAGAR Date: Fri, 28 Jul 2023 12:05:35 -0500 Subject: [PATCH 008/230] Preserve names of column object in various APIs (#13772) This PR preserves column names in various APIs by retaining `self._data._level_names` and also calculating when to preserve the column names. Fixes: #13741, #13740 Authors: - GALI PREM SAGAR (https://github.com/galipremsagar) Approvers: - Bradley Dice (https://github.com/bdice) - Ashwin Srinath (https://github.com/shwina) - Lawrence Mitchell (https://github.com/wence-) URL: https://github.com/rapidsai/cudf/pull/13772 --- python/cudf/cudf/core/dataframe.py | 37 ++++++++++++++++++--- python/cudf/cudf/core/indexed_frame.py | 25 ++++++++++++--- python/cudf/cudf/core/series.py | 5 ++- python/cudf/cudf/tests/test_dataframe.py | 41 ++++++++++++++++++++++-- 4 files changed, 96 insertions(+), 12 deletions(-) diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index 0fe89490905..fc6c669256f 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -723,6 +723,10 @@ def __init__( if dtype: self._data = self.astype(dtype)._data + self._data.multiindex = self._data.multiindex or isinstance( + columns, pd.MultiIndex + ) + @_cudf_nvtx_annotate def _init_from_series_list(self, data, columns, index): if index is None: @@ -1820,19 +1824,29 @@ def _make_operands_and_index_for_binop( NotImplementedType, ], Optional[BaseIndex], + bool, ]: lhs, rhs = self._data, other index = self._index fill_requires_key = False left_default: Any = False + equal_columns = False + can_use_self_column_name = True if _is_scalar_or_zero_d_array(other): rhs = {name: other for name in self._data} + equal_columns = True elif isinstance(other, Series): rhs = dict(zip(other.index.values_host, other.values_host)) # For keys in right but not left, perform binops between NaN (not # NULL!) and the right value (result is NaN). 
left_default = as_column(np.nan, length=len(self)) + equal_columns = other.index.to_pandas().equals( + self._data.to_pandas_index() + ) + can_use_self_column_name = equal_columns or ( + list(other._index._data.names) == self._data._level_names + ) elif isinstance(other, DataFrame): if ( not can_reindex @@ -1854,13 +1868,18 @@ def _make_operands_and_index_for_binop( # For DataFrame-DataFrame ops, always default to operating against # the fill value. left_default = fill_value + equal_columns = self._column_names == other._column_names + can_use_self_column_name = ( + equal_columns + or self._data._level_names == other._data._level_names + ) elif isinstance(other, (dict, abc.Mapping)): # Need to fail early on host mapping types because we ultimately # convert everything to a dict. - return NotImplemented, None + return NotImplemented, None, True if not isinstance(rhs, (dict, abc.Mapping)): - return NotImplemented, None + return NotImplemented, None, True operands = { k: ( @@ -1876,7 +1895,8 @@ def _make_operands_and_index_for_binop( for k, v in rhs.items(): if k not in lhs: operands[k] = (left_default, v, reflect, None) - return operands, index + + return operands, index, can_use_self_column_name @classmethod @_cudf_nvtx_annotate @@ -5042,6 +5062,7 @@ def from_pandas(cls, dataframe, nan_as_null=None): index = cudf.from_pandas(dataframe.index, nan_as_null=nan_as_null) df = cls._from_data(data, index) + df._data._level_names = list(dataframe.columns.names) # Set columns only if it is a MultiIndex if isinstance(dataframe.columns, pd.MultiIndex): @@ -5085,13 +5106,19 @@ def from_arrow(cls, table): 2 3 6 """ index_col = None + col_index_names = None if isinstance(table, pa.Table) and isinstance( table.schema.pandas_metadata, dict ): index_col = table.schema.pandas_metadata["index_columns"] + if "column_indexes" in table.schema.pandas_metadata: + col_index_names = [] + for col_meta in table.schema.pandas_metadata["column_indexes"]: + col_index_names.append(col_meta["name"]) out = super().from_arrow(table) - + if col_index_names is not None: + out._data._level_names = col_index_names if index_col: if isinstance(index_col[0], dict): idx = cudf.RangeIndex( @@ -5337,6 +5364,8 @@ def _from_arrays(cls, data, index=None, columns=None, nan_as_null=False): df._data[names[0]] = column.as_column( data, nan_as_null=nan_as_null ) + if isinstance(columns, pd.Index): + df._data._level_names = list(columns.names) if index is None: df._index = RangeIndex(start=0, stop=len(data)) diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py index aa0f060c8da..0ffc3948e67 100644 --- a/python/cudf/cudf/core/indexed_frame.py +++ b/python/cudf/cudf/core/indexed_frame.py @@ -293,7 +293,9 @@ def _from_data( @_cudf_nvtx_annotate def _from_data_like_self(self, data: MutableMapping): - return self._from_data(data, self._index) + out = self._from_data(data, self._index) + out._data._level_names = self._data._level_names + return out @classmethod @_cudf_nvtx_annotate @@ -3128,7 +3130,9 @@ def _reset_index(self, level, drop, col_level=0, col_fill=""): # inserted to the left of existing data columns. 
return ( ColumnAccessor( - {**new_column_data, **self._data}, self._data.multiindex + {**new_column_data, **self._data}, + self._data.multiindex, + self._data._level_names, ), index, ) @@ -3465,14 +3469,24 @@ def _binaryop( **kwargs, ): reflect, op = self._check_reflected_op(op) - operands, out_index = self._make_operands_and_index_for_binop( + ( + operands, + out_index, + can_use_self_column_name, + ) = self._make_operands_and_index_for_binop( other, op, fill_value, reflect, can_reindex ) if operands is NotImplemented: return NotImplemented + level_names = ( + None if not can_use_self_column_name else self._data._level_names + ) return self._from_data( - ColumnAccessor(type(self)._colwise_binop(operands, op)), + ColumnAccessor( + type(self)._colwise_binop(operands, op), + level_names=level_names, + ), index=out_index, ) @@ -3491,6 +3505,7 @@ def _make_operands_and_index_for_binop( NotImplementedType, ], Optional[cudf.BaseIndex], + bool, ]: raise NotImplementedError( f"Binary operations are not supported for {self.__class__}" @@ -3516,7 +3531,7 @@ def __array_ufunc__(self, ufunc, method, *inputs, **kwargs): if cupy_func: if ufunc.nin == 2: other = inputs[self is inputs[0]] - inputs, index = self._make_operands_and_index_for_binop( + inputs, index, _ = self._make_operands_and_index_for_binop( other, fname ) else: diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index aaac91e927a..02de3b8282a 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -1472,6 +1472,7 @@ def _make_operands_and_index_for_binop( NotImplementedType, ], Optional[BaseIndex], + bool, ]: # Specialize binops to align indices. if isinstance(other, Series): @@ -1484,11 +1485,13 @@ def _make_operands_and_index_for_binop( "Can only compare identically-labeled Series objects" ) lhs, other = _align_indices([self, other], allow_non_unique=True) + can_use_self_column_name = self.name == other.name else: lhs = self + can_use_self_column_name = False operands = lhs._make_operands_for_binop(other, fill_value, reflect) - return operands, lhs._index + return operands, lhs._index, can_use_self_column_name @copy_docstring(CategoricalAccessor) # type: ignore @property diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py index 25a17697538..d443cd92968 100644 --- a/python/cudf/cudf/tests/test_dataframe.py +++ b/python/cudf/cudf/tests/test_dataframe.py @@ -1708,6 +1708,7 @@ def test_nonmatching_index_setitem(nrows): ) def test_from_pandas(dtype): df = pd.DataFrame({"x": [1, 2, 3]}, index=[4.0, 5.0, 6.0], dtype=dtype) + df.columns.name = "custom_column_name" gdf = cudf.DataFrame.from_pandas(df) assert isinstance(gdf, cudf.DataFrame) @@ -2483,8 +2484,15 @@ def test_bitwise_binops_series(pdf, gdf, binop): @pytest.mark.parametrize("unaryop", [operator.neg, operator.inv, operator.abs]) -def test_unaryops_df(pdf, gdf, unaryop): - d = unaryop(pdf - 5) +@pytest.mark.parametrize( + "col_name,assign_col_name", [(None, False), (None, True), ("abc", True)] +) +def test_unaryops_df(pdf, unaryop, col_name, assign_col_name): + pd_df = pdf.copy() + if assign_col_name: + pd_df.columns.name = col_name + gdf = cudf.from_pandas(pd_df) + d = unaryop(pd_df - 5) g = unaryop(gdf - 5) assert_eq(d, g) @@ -2626,6 +2634,12 @@ def test_arrow_pandas_compat(pdf, gdf, preserve_index): pdf2 = pdf_arrow_table.to_pandas() assert_eq(pdf2, gdf2) + pdf.columns.name = "abc" + pdf_arrow_table = pa.Table.from_pandas(pdf, preserve_index=preserve_index) + + gdf2 = 
cudf.DataFrame.from_arrow(pdf_arrow_table) + pdf2 = pdf_arrow_table.to_pandas() + assert_eq(pdf2, gdf2) @pytest.mark.parametrize("dtype", NUMERIC_TYPES + ["bool"]) @@ -2912,6 +2926,7 @@ def test_tail_for_string(): ["v0", "v1"], ["v0", "index"], pd.MultiIndex.from_tuples([("x0", "x1"), ("y0", "y1")]), + pd.MultiIndex.from_tuples([(1, 2), (10, 11)], names=["ABC", "DEF"]), ], ) @pytest.mark.parametrize("inplace", [True, False]) @@ -10147,3 +10162,25 @@ def test_dataframe_init_length_error(data, index): {"data": data, "index": index}, ), ) + + +def test_dataframe_init_columns_named_multiindex(): + np.random.seed(0) + data = np.random.randn(2, 2) + columns = cudf.MultiIndex.from_tuples( + [("A", "one"), ("A", "two")], names=["y", "z"] + ) + gdf = cudf.DataFrame(data, columns=columns) + pdf = pd.DataFrame(data, columns=columns.to_pandas()) + + assert_eq(gdf, pdf) + + +def test_dataframe_init_columns_named_index(): + np.random.seed(0) + data = np.random.randn(2, 2) + columns = pd.Index(["a", "b"], name="custom_name") + gdf = cudf.DataFrame(data, columns=columns) + pdf = pd.DataFrame(data, columns=columns) + + assert_eq(gdf, pdf) From 2dabf0c65c2c8457a5f881029ce62e238652ba35 Mon Sep 17 00:00:00 2001 From: Lawrence Mitchell Date: Fri, 28 Jul 2023 21:21:28 +0100 Subject: [PATCH 009/230] Fix construction of DataFrames from dict when columns are provided (#13766) If a columns argument is provided to the dataframe constructor, this should be used to select columns from the provided data dictionary. The previous logic did do this correctly, but didn't preserve the appropriate order of the resulting columns (which should come out in the order that the column selection is in). - Closes #13738 Authors: - Lawrence Mitchell (https://github.com/wence-) - GALI PREM SAGAR (https://github.com/galipremsagar) Approvers: - GALI PREM SAGAR (https://github.com/galipremsagar) URL: https://github.com/rapidsai/cudf/pull/13766 --- python/cudf/cudf/core/dataframe.py | 49 +++++++++++------------- python/cudf/cudf/tests/test_dataframe.py | 31 +++++++-------- 2 files changed, 36 insertions(+), 44 deletions(-) diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index fc6c669256f..dbbf0f0017a 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -868,31 +868,29 @@ def _init_from_dict_like( self, data, index=None, columns=None, nan_as_null=None ): if columns is not None: - # remove all entries in `data` that are - # not in `columns` - keys = [key for key in data.keys() if key in columns] - extra_cols = [col for col in columns if col not in keys] - if keys: - # if keys is non-empty, - # add null columns for all values - # in `columns` that don't exist in `keys`: - data = {key: data[key] for key in keys} - data.update({key: None for key in extra_cols}) + # remove all entries in data that are not in columns, + # inserting new empty columns for entries in columns that + # are not in data + if any(c in data for c in columns): + # Let the downstream logic determine the length of the + # empty columns here + empty_column = lambda: None # noqa: E731 else: - # If keys is empty, none of the data keys match the columns, so - # we need to create an empty DataFrame. To match pandas, the - # size of the dataframe must match the provided index, so we - # need to return a masked array of nulls if an index is given. 
- row_count = 0 if index is None else len(index) - masked = index is not None - data = { - key: cudf.core.column.column_empty( - row_count=row_count, - dtype=None, - masked=masked, - ) - for key in extra_cols - } + # If keys is empty, none of the data keys match the + # columns, so we need to create an empty DataFrame. To + # match pandas, the size of the dataframe must match + # the provided index, so we need to return a masked + # array of nulls if an index is given. + empty_column = functools.partial( + cudf.core.column.column_empty, + row_count=(0 if index is None else len(index)), + dtype=None, + masked=index is not None, + ) + + data = { + c: data[c] if c in data else empty_column() for c in columns + } data, index = self._align_input_series_indices(data, index=index) @@ -934,9 +932,6 @@ def _init_from_dict_like( nan_as_null=nan_as_null, ) - if columns is not None: - self.columns = columns - @classmethod def _from_data( cls, diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py index d443cd92968..9ec03d4fd8c 100644 --- a/python/cudf/cudf/tests/test_dataframe.py +++ b/python/cudf/cudf/tests/test_dataframe.py @@ -240,7 +240,6 @@ def test_series_from_cupy_scalars(): @pytest.mark.parametrize("a", [[1, 2, 3], [1, 10, 30]]) @pytest.mark.parametrize("b", [[4, 5, 6], [-11, -100, 30]]) def test_append_index(a, b): - df = pd.DataFrame() df["a"] = a df["b"] = b @@ -368,7 +367,6 @@ def test_dataframe_truncate_datetimeindex(): def test_series_init_none(): - # test for creating empty series # 1: without initializing sr1 = cudf.Series() @@ -1503,7 +1501,6 @@ def test_dataframe_concat_different_column_types(): "df_2", [cudf.DataFrame({"a": [], "b": []}), cudf.DataFrame({})] ) def test_concat_empty_dataframe(df_1, df_2): - got = cudf.concat([df_1, df_2]) expect = pd.concat([df_1.to_pandas(), df_2.to_pandas()], sort=False) @@ -2644,7 +2641,6 @@ def test_arrow_pandas_compat(pdf, gdf, preserve_index): @pytest.mark.parametrize("dtype", NUMERIC_TYPES + ["bool"]) def test_cuda_array_interface(dtype): - np_data = np.arange(10).astype(dtype) cupy_data = cupy.array(np_data) pd_data = pd.Series(np_data) @@ -3822,7 +3818,6 @@ def test_diff(dtype, period, data_empty): @pytest.mark.parametrize("df", _dataframe_na_data()) @pytest.mark.parametrize("nan_as_null", [True, False, None]) def test_dataframe_isnull_isna(df, nan_as_null): - gdf = cudf.DataFrame.from_pandas(df, nan_as_null=nan_as_null) assert_eq(df.isnull(), gdf.isnull()) @@ -3837,7 +3832,6 @@ def test_dataframe_isnull_isna(df, nan_as_null): @pytest.mark.parametrize("df", _dataframe_na_data()) @pytest.mark.parametrize("nan_as_null", [True, False, None]) def test_dataframe_notna_notnull(df, nan_as_null): - gdf = cudf.DataFrame.from_pandas(df, nan_as_null=nan_as_null) assert_eq(df.notnull(), gdf.notnull()) @@ -5217,7 +5211,6 @@ def test_rowwise_ops_nullable_int_dtypes(op, expected): @pytest.mark.parametrize("op", ["max", "min"]) @pytest.mark.parametrize("skipna", [True, False]) def test_rowwise_ops_datetime_dtypes(data, op, skipna): - gdf = cudf.DataFrame(data) pdf = gdf.to_pandas() @@ -5281,7 +5274,6 @@ def test_rowwise_ops_datetime_dtypes(data, op, skipna): ], ) def test_rowwise_ops_datetime_dtypes_2(data, op, skipna): - gdf = cudf.DataFrame(data) pdf = gdf.to_pandas() @@ -5529,13 +5521,11 @@ def test_memory_usage(deep, index, set_index): gdf = cudf.from_pandas(df) if index and set_index is None: - # Special Case: Assume RangeIndex size == 0 with expect_warning_if(deep, UserWarning): assert 
gdf.index.memory_usage(deep=deep) == 0 else: - # Check for Series only assert df["B"].memory_usage(index=index, deep=deep) == gdf[ "B" @@ -6249,7 +6239,6 @@ def test_from_pandas_unsupported_types(data, expected_upcast_type, error): @pytest.mark.parametrize("nan_as_null", [True, False]) @pytest.mark.parametrize("index", [None, "a", ["a", "b"]]) def test_from_pandas_nan_as_null(nan_as_null, index): - data = [np.nan, 2.0, 3.0] if index is None: @@ -6283,7 +6272,6 @@ def test_from_pandas_nan_as_null(nan_as_null, index): @pytest.mark.parametrize("nan_as_null", [True, False]) def test_from_pandas_for_series_nan_as_null(nan_as_null): - data = [np.nan, 2.0, 3.0] psr = pd.Series(data) @@ -6428,7 +6416,6 @@ def test_dataframe_init_1d_list(data, columns): ], ) def test_dataframe_init_from_arrays_cols(data, cols, index): - gd_data = data if isinstance(data, cupy.ndarray): # pandas can't handle cupy arrays in general @@ -6564,7 +6551,6 @@ def test_dataframe_assign_scalar_with_scalar_cols(col_data, assign_val): def test_dataframe_info_basic(): - buffer = io.StringIO() str_cmp = textwrap.dedent( """\ @@ -7096,7 +7082,6 @@ def test_dataframe_to_dict(orient, into): ], ) def test_dataframe_from_dict(data, orient, dtype, columns): - expected = pd.DataFrame.from_dict( data=data, orient=orient, dtype=dtype, columns=columns ) @@ -7194,7 +7179,6 @@ def test_dataframe_from_dict_transposed(dtype): def test_dataframe_from_dict_cp_np_arrays( pd_data, gd_data, orient, dtype, columns ): - expected = pd.DataFrame.from_dict( data=pd_data, orient=orient, dtype=dtype, columns=columns ) @@ -10019,7 +10003,6 @@ def test_non_string_column_name_to_arrow(data): def test_complex_types_from_arrow(): - expected = pa.Table.from_arrays( [ pa.array([1, 2, 3]), @@ -10164,6 +10147,20 @@ def test_dataframe_init_length_error(data, index): ) +@pytest.mark.parametrize( + "columns", ([], ["c", "a"], ["a", "d", "b", "e", "c"], ["a", "b", "c"]) +) +@pytest.mark.parametrize("index", (None, [4, 5, 6])) +def test_dataframe_dict_like_with_columns(columns, index): + data = {"a": [1, 2, 3], "b": [4, 5, 6], "c": [7, 8, 9]} + expect = pd.DataFrame(data, columns=columns, index=index) + actual = cudf.DataFrame(data, columns=columns, index=index) + if index is None and columns == []: + # We make an empty range index, pandas makes an empty index + expect = expect.reset_index(drop=True) + assert_eq(expect, actual) + + def test_dataframe_init_columns_named_multiindex(): np.random.seed(0) data = np.random.randn(2, 2) From 70b8f1f5513f55e4101a8f03f2d8faf12639d94a Mon Sep 17 00:00:00 2001 From: GALI PREM SAGAR Date: Fri, 28 Jul 2023 17:10:53 -0500 Subject: [PATCH 010/230] Fix binary operation column ordering and missing column issues (#13778) This PR fixes various cases in binary operations where columns are of certain dtypes and the binary operations on those dataframes and series don't yield correct results, correct resulting column types, or have missing columns altogether. This PR also introduces ensuring column ordering to match pandas binary ops column ordering when pandas compatibility mode is enabled. 
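A short sketch of the column-alignment behavior this change targets, adapted from the new `test_dataframe_binop_with_mixed_string_types` test (column labels here are illustrative; assumes a GPU-enabled cudf build):

```python
# Sketch only: binary ops between frames with partially overlapping,
# mixed-type column labels should yield the same column set and ordering
# as the equivalent pandas operation.
import numpy as np
import pandas as pd
import cudf

df1 = pd.DataFrame(np.random.rand(3, 3), columns=pd.Index([0, 1, 2]))
df2 = pd.DataFrame(
    np.random.rand(3, 6), columns=pd.Index([0, 1, 2, "x", "y", "z"])
)

expected = df2 + df1                                 # pandas reference
got = cudf.from_pandas(df2) + cudf.from_pandas(df1)  # should match pandas
print(list(expected.columns))
print(list(got.columns))
```
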
Authors: - GALI PREM SAGAR (https://github.com/galipremsagar) Approvers: - Lawrence Mitchell (https://github.com/wence-) - Vyas Ramasubramani (https://github.com/vyasr) - Bradley Dice (https://github.com/bdice) URL: https://github.com/rapidsai/cudf/pull/13778 --- python/cudf/cudf/core/dataframe.py | 29 +++++++--- python/cudf/cudf/core/indexed_frame.py | 2 +- python/cudf/cudf/tests/test_dataframe.py | 70 ++++++++++++++++++++++-- 3 files changed, 89 insertions(+), 12 deletions(-) diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index dbbf0f0017a..a510e6829d1 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -1832,15 +1832,16 @@ def _make_operands_and_index_for_binop( rhs = {name: other for name in self._data} equal_columns = True elif isinstance(other, Series): - rhs = dict(zip(other.index.values_host, other.values_host)) + rhs = dict(zip(other.index.to_pandas(), other.values_host)) # For keys in right but not left, perform binops between NaN (not # NULL!) and the right value (result is NaN). left_default = as_column(np.nan, length=len(self)) equal_columns = other.index.to_pandas().equals( self._data.to_pandas_index() ) - can_use_self_column_name = equal_columns or ( - list(other._index._data.names) == self._data._level_names + can_use_self_column_name = ( + equal_columns + or list(other._index._data.names) == self._data._level_names ) elif isinstance(other, DataFrame): if ( @@ -1891,6 +1892,20 @@ def _make_operands_and_index_for_binop( if k not in lhs: operands[k] = (left_default, v, reflect, None) + if not equal_columns: + if isinstance(other, DataFrame): + column_names_list = self._data.to_pandas_index().join( + other._data.to_pandas_index(), how="outer" + ) + elif isinstance(other, Series): + column_names_list = self._data.to_pandas_index().join( + other.index.to_pandas(), how="outer" + ) + else: + raise ValueError("other must be a DataFrame or Series.") + + sorted_dict = {key: operands[key] for key in column_names_list} + return sorted_dict, index, can_use_self_column_name return operands, index, can_use_self_column_name @classmethod @@ -7467,13 +7482,13 @@ def _align_indices(lhs, rhs): lhs_out = DataFrame(index=df.index) rhs_out = DataFrame(index=df.index) common = set(lhs._column_names) & set(rhs._column_names) - common_x = {f"{x}_x" for x in common} - common_y = {f"{x}_y" for x in common} + common_x = {f"{x}_x": x for x in common} + common_y = {f"{x}_y": x for x in common} for col in df._column_names: if col in common_x: - lhs_out[col[:-2]] = df[col] + lhs_out[common_x[col]] = df[col] elif col in common_y: - rhs_out[col[:-2]] = df[col] + rhs_out[common_y[col]] = df[col] elif col in lhs: lhs_out[col] = df[col] elif col in rhs: diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py index 0ffc3948e67..5bd19b9f9c1 100644 --- a/python/cudf/cudf/core/indexed_frame.py +++ b/python/cudf/cudf/core/indexed_frame.py @@ -3480,7 +3480,7 @@ def _binaryop( return NotImplemented level_names = ( - None if not can_use_self_column_name else self._data._level_names + self._data._level_names if can_use_self_column_name else None ) return self._from_data( ColumnAccessor( diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py index 9ec03d4fd8c..aad0b757177 100644 --- a/python/cudf/cudf/tests/test_dataframe.py +++ b/python/cudf/cudf/tests/test_dataframe.py @@ -5388,10 +5388,7 @@ def test_cov_nans(): cudf.Series([4, 2, 3], index=["a", "b", "d"]), 
cudf.Series([4, 2], index=["a", "b"]), cudf.Series([4, 2, 3], index=cudf.core.index.RangeIndex(0, 3)), - pytest.param( - cudf.Series([4, 2, 3, 4, 5], index=["a", "b", "d", "0", "12"]), - marks=pytest_xfail, - ), + cudf.Series([4, 2, 3, 4, 5], index=["a", "b", "d", "0", "12"]), ], ) @pytest.mark.parametrize("colnames", [["a", "b", "c"], [0, 1, 2]]) @@ -10147,6 +10144,71 @@ def test_dataframe_init_length_error(data, index): ) +def test_dataframe_binop_with_mixed_date_types(): + df = pd.DataFrame( + np.random.rand(2, 2), + columns=pd.Index(["2000-01-03", "2000-01-04"], dtype="datetime64[ns]"), + ) + ser = pd.Series(np.random.rand(3), index=[0, 1, 2]) + gdf = cudf.from_pandas(df) + gser = cudf.from_pandas(ser) + expected = df - ser + got = gdf - gser + assert_eq(expected, got) + + +def test_dataframe_binop_with_mixed_string_types(): + df1 = pd.DataFrame(np.random.rand(3, 3), columns=pd.Index([0, 1, 2])) + df2 = pd.DataFrame( + np.random.rand(6, 6), + columns=pd.Index([0, 1, 2, "VhDoHxRaqt", "X0NNHBIPfA", "5FbhPtS0D1"]), + ) + gdf1 = cudf.from_pandas(df1) + gdf2 = cudf.from_pandas(df2) + + expected = df2 + df1 + got = gdf2 + gdf1 + + assert_eq(expected, got) + + +def test_dataframe_binop_and_where(): + df = pd.DataFrame(np.random.rand(2, 2), columns=pd.Index([True, False])) + gdf = cudf.from_pandas(df) + + expected = df > 1 + got = gdf > 1 + + assert_eq(expected, got) + + expected = df[df > 1] + got = gdf[gdf > 1] + + assert_eq(expected, got) + + +def test_dataframe_binop_with_datetime_index(): + df = pd.DataFrame( + np.random.rand(2, 2), + columns=pd.Index(["2000-01-03", "2000-01-04"], dtype="datetime64[ns]"), + ) + ser = pd.Series( + np.random.rand(2), + index=pd.Index( + [ + "2000-01-04", + "2000-01-03", + ], + dtype="datetime64[ns]", + ), + ) + gdf = cudf.from_pandas(df) + gser = cudf.from_pandas(ser) + expected = df - ser + got = gdf - gser + assert_eq(expected, got) + + @pytest.mark.parametrize( "columns", ([], ["c", "a"], ["a", "d", "b", "e", "c"], ["a", "b", "c"]) ) From 9d465f258ffdfe88e7b661cce8cae59741113ae0 Mon Sep 17 00:00:00 2001 From: GALI PREM SAGAR Date: Mon, 31 Jul 2023 08:29:11 -0500 Subject: [PATCH 011/230] Upgrade to arrow 12 (#13728) This PR upgrades arrow version in `cudf` to `12.0.1` Authors: - GALI PREM SAGAR (https://github.com/galipremsagar) - Vyas Ramasubramani (https://github.com/vyasr) Approvers: - Bradley Dice (https://github.com/bdice) - Ray Douglass (https://github.com/raydouglass) URL: https://github.com/rapidsai/cudf/pull/13728 --- conda/environments/all_cuda-118_arch-x86_64.yaml | 4 ++-- conda/environments/all_cuda-120_arch-x86_64.yaml | 4 ++-- conda/recipes/cudf/meta.yaml | 2 +- conda/recipes/libcudf/conda_build_config.yaml | 2 +- cpp/cmake/thirdparty/get_arrow.cmake | 2 +- cpp/tests/io/arrow_io_source_test.cpp | 1 + dependencies.yaml | 8 ++++---- python/cudf/pyproject.toml | 4 ++-- python/cudf_kafka/pyproject.toml | 2 +- 9 files changed, 15 insertions(+), 14 deletions(-) diff --git a/conda/environments/all_cuda-118_arch-x86_64.yaml b/conda/environments/all_cuda-118_arch-x86_64.yaml index 8c97db4d496..4fad893f768 100644 --- a/conda/environments/all_cuda-118_arch-x86_64.yaml +++ b/conda/environments/all_cuda-118_arch-x86_64.yaml @@ -38,7 +38,7 @@ dependencies: - hypothesis - identify>=2.5.20 - ipython -- libarrow==11.0.0.* +- libarrow==12.0.1.* - libcufile-dev=1.4.0.31 - libcufile=1.4.0.31 - libcurand-dev=10.3.0.86 @@ -66,7 +66,7 @@ dependencies: - pre-commit - protobuf>=4.21,<5 - ptxcompiler -- pyarrow==11.0.0.* +- pyarrow==12.0.1.* - pydata-sphinx-theme - 
pyorc - pytest diff --git a/conda/environments/all_cuda-120_arch-x86_64.yaml b/conda/environments/all_cuda-120_arch-x86_64.yaml index 03430a8d346..1a4abe64b95 100644 --- a/conda/environments/all_cuda-120_arch-x86_64.yaml +++ b/conda/environments/all_cuda-120_arch-x86_64.yaml @@ -39,7 +39,7 @@ dependencies: - hypothesis - identify>=2.5.20 - ipython -- libarrow==11.0.0.* +- libarrow==12.0.1.* - libcufile-dev - libcurand-dev - libkvikio==23.10.* @@ -63,7 +63,7 @@ dependencies: - pip - pre-commit - protobuf>=4.21,<5 -- pyarrow==11.0.0.* +- pyarrow==12.0.1.* - pydata-sphinx-theme - pyorc - pytest diff --git a/conda/recipes/cudf/meta.yaml b/conda/recipes/cudf/meta.yaml index 0249efcedf6..612d2b177f6 100644 --- a/conda/recipes/cudf/meta.yaml +++ b/conda/recipes/cudf/meta.yaml @@ -61,7 +61,7 @@ requirements: - scikit-build >=0.13.1 - setuptools - dlpack >=0.5,<0.6.0a0 - - pyarrow =11 + - pyarrow =12 - libcudf ={{ version }} - rmm ={{ minor_version }} {% if cuda_major == "11" %} diff --git a/conda/recipes/libcudf/conda_build_config.yaml b/conda/recipes/libcudf/conda_build_config.yaml index 4d03dab2f35..0397045786b 100644 --- a/conda/recipes/libcudf/conda_build_config.yaml +++ b/conda/recipes/libcudf/conda_build_config.yaml @@ -23,7 +23,7 @@ gtest_version: - ">=1.13.0" libarrow_version: - - "=11" + - "=12" dlpack_version: - ">=0.5,<0.6.0a0" diff --git a/cpp/cmake/thirdparty/get_arrow.cmake b/cpp/cmake/thirdparty/get_arrow.cmake index c877c9c6466..036ef880f99 100644 --- a/cpp/cmake/thirdparty/get_arrow.cmake +++ b/cpp/cmake/thirdparty/get_arrow.cmake @@ -381,7 +381,7 @@ if(NOT DEFINED CUDF_VERSION_Arrow) set(CUDF_VERSION_Arrow # This version must be kept in sync with the libarrow version pinned for builds in # dependencies.yaml. - 11.0.0 + 12.0.1 CACHE STRING "The version of Arrow to find (or build)" ) endif() diff --git a/cpp/tests/io/arrow_io_source_test.cpp b/cpp/tests/io/arrow_io_source_test.cpp index 2961deec384..d7f1879040b 100644 --- a/cpp/tests/io/arrow_io_source_test.cpp +++ b/cpp/tests/io/arrow_io_source_test.cpp @@ -87,6 +87,7 @@ TEST_F(ArrowIOTest, S3FileSystem) ASSERT_EQ(1, tbl.tbl->num_columns()); // Only single column specified in reader_options ASSERT_EQ(244, tbl.tbl->num_rows()); // known number of rows from the S3 file } + CUDF_EXPECTS(arrow::fs::EnsureS3Finalized().ok(), "Failed to finalize s3 filesystem"); } CUDF_TEST_PROGRAM_MAIN() diff --git a/dependencies.yaml b/dependencies.yaml index 6bf78d18a41..fc3bf498918 100644 --- a/dependencies.yaml +++ b/dependencies.yaml @@ -223,7 +223,7 @@ dependencies: - &gmock gmock>=1.13.0 # Hard pin the patch version used during the build. This must be kept # in sync with the version pinned in get_arrow.cmake. - - libarrow==11.0.0.* + - libarrow==12.0.1.* - librdkafka>=1.9.0,<1.10.0a0 - spdlog>=1.11.0,<1.12 specific: @@ -253,7 +253,7 @@ dependencies: - cython>=0.29,<0.30 # Hard pin the patch version used during the build. This must be kept # in sync with the version pinned in get_arrow.cmake. 
- - pyarrow==11.0.0.* + - pyarrow==12.0.1.* - numpy>=1.21 build_python: common: @@ -272,13 +272,13 @@ dependencies: - output_types: [conda, requirements] packages: # Allow runtime version to float up to minor version - - libarrow==11.* + - libarrow==12.* pyarrow_run: common: - output_types: [conda, requirements, pyproject] packages: # Allow runtime version to float up to minor version - - pyarrow==11.* + - pyarrow==12.* cudatoolkit: specific: - output_types: conda diff --git a/python/cudf/pyproject.toml b/python/cudf/pyproject.toml index e37c3642082..5cd63893844 100644 --- a/python/cudf/pyproject.toml +++ b/python/cudf/pyproject.toml @@ -8,7 +8,7 @@ requires = [ "ninja", "numpy>=1.21", "protoc-wheel", - "pyarrow==11.0.0.*", + "pyarrow==12.0.1.*", "rmm==23.10.*", "scikit-build>=0.13.1", "setuptools", @@ -38,7 +38,7 @@ dependencies = [ "pandas>=1.3,<1.6.0dev0", "protobuf>=4.21,<5", "ptxcompiler", - "pyarrow==11.*", + "pyarrow==12.*", "rmm==23.10.*", "typing_extensions>=4.0.0", ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. diff --git a/python/cudf_kafka/pyproject.toml b/python/cudf_kafka/pyproject.toml index d458969d40f..081a2f69800 100644 --- a/python/cudf_kafka/pyproject.toml +++ b/python/cudf_kafka/pyproject.toml @@ -5,7 +5,7 @@ requires = [ "cython>=0.29,<0.30", "numpy>=1.21", - "pyarrow==11.0.0.*", + "pyarrow==12.0.1.*", "setuptools", "wheel", ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. From 02357b17ff3b2cc3e8840281e7a9b0ed159cdcf0 Mon Sep 17 00:00:00 2001 From: GALI PREM SAGAR Date: Mon, 31 Jul 2023 12:04:05 -0500 Subject: [PATCH 012/230] Raise error when trying to join `datetime` and `timedelta` types with other types (#13786) This PR fixes an issue when trying to merge a `datetime`|`timdelta` type column with another type: ```python In [1]: import cudf In [2]: import pandas as pd In [3]: df = cudf.DataFrame({'a': cudf.Series([10, 20, 30], dtype='datetime64[ns]')}) In [4]: df2 = df.astype('int') In [5]: df.merge(df2) Out[5]: a 0 10.0 1 20.0 2 30.0 In [6]: df2.merge(df) Out[6]: a 0 10.0 1 20.0 2 30.0 In [7]: df Out[7]: a 0 1970-01-01 00:00:00.000000010 1 1970-01-01 00:00:00.000000020 2 1970-01-01 00:00:00.000000030 In [8]: df2 Out[8]: a 0 10 1 20 2 30 In [9]: df.to_pandas().merge(df2.to_pandas()) --------------------------------------------------------------------------- ValueError Traceback (most recent call last) Cell In[9], line 1 ----> 1 df.to_pandas().merge(df2.to_pandas()) File /nvme/0/pgali/envs/cudfdev/lib/python3.10/site-packages/pandas/core/frame.py:10092, in DataFrame.merge(self, right, how, on, left_on, right_on, left_index, right_index, sort, suffixes, copy, indicator, validate) 10073 @Substitution("") 10074 @Appender(_merge_doc, indents=2) 10075 def merge( (...) 
10088 validate: str | None = None, 10089 ) -> DataFrame: 10090 from pandas.core.reshape.merge import merge > 10092 return merge( 10093 self, 10094 right, 10095 how=how, 10096 on=on, 10097 left_on=left_on, 10098 right_on=right_on, 10099 left_index=left_index, 10100 right_index=right_index, 10101 sort=sort, 10102 suffixes=suffixes, 10103 copy=copy, 10104 indicator=indicator, 10105 validate=validate, 10106 ) File /nvme/0/pgali/envs/cudfdev/lib/python3.10/site-packages/pandas/core/reshape/merge.py:110, in merge(left, right, how, on, left_on, right_on, left_index, right_index, sort, suffixes, copy, indicator, validate) 93 @Substitution("\nleft : DataFrame or named Series") 94 @Appender(_merge_doc, indents=0) 95 def merge( (...) 108 validate: str | None = None, 109 ) -> DataFrame: --> 110 op = _MergeOperation( 111 left, 112 right, 113 how=how, 114 on=on, 115 left_on=left_on, 116 right_on=right_on, 117 left_index=left_index, 118 right_index=right_index, 119 sort=sort, 120 suffixes=suffixes, 121 indicator=indicator, 122 validate=validate, 123 ) 124 return op.get_result(copy=copy) File /nvme/0/pgali/envs/cudfdev/lib/python3.10/site-packages/pandas/core/reshape/merge.py:707, in _MergeOperation.__init__(self, left, right, how, on, left_on, right_on, axis, left_index, right_index, sort, suffixes, indicator, validate) 699 ( 700 self.left_join_keys, 701 self.right_join_keys, 702 self.join_names, 703 ) = self._get_merge_keys() 705 # validate the merge keys dtypes. We may need to coerce 706 # to avoid incompatible dtypes --> 707 self._maybe_coerce_merge_keys() 709 # If argument passed to validate, 710 # check if columns specified as unique 711 # are in fact unique. 712 if validate is not None: File /nvme/0/pgali/envs/cudfdev/lib/python3.10/site-packages/pandas/core/reshape/merge.py:1344, in _MergeOperation._maybe_coerce_merge_keys(self) 1342 # datetimelikes must match exactly 1343 elif needs_i8_conversion(lk.dtype) and not needs_i8_conversion(rk.dtype): -> 1344 raise ValueError(msg) 1345 elif not needs_i8_conversion(lk.dtype) and needs_i8_conversion(rk.dtype): 1346 raise ValueError(msg) ValueError: You are trying to merge on datetime64[ns] and int64 columns. If you wish to proceed you should use pd.concat ``` Authors: - GALI PREM SAGAR (https://github.com/galipremsagar) Approvers: - Bradley Dice (https://github.com/bdice) URL: https://github.com/rapidsai/cudf/pull/13786 --- python/cudf/cudf/core/join/_join_helpers.py | 36 ++++++++++++++++++--- python/cudf/cudf/tests/test_joining.py | 34 +++++++++++++++++++ 2 files changed, 65 insertions(+), 5 deletions(-) diff --git a/python/cudf/cudf/core/join/_join_helpers.py b/python/cudf/cudf/core/join/_join_helpers.py index 885a0e41b66..7d799fa1573 100644 --- a/python/cudf/cudf/core/join/_join_helpers.py +++ b/python/cudf/cudf/core/join/_join_helpers.py @@ -1,4 +1,4 @@ -# Copyright (c) 2021-2022, NVIDIA CORPORATION. +# Copyright (c) 2021-2023, NVIDIA CORPORATION. 
from __future__ import annotations @@ -85,17 +85,43 @@ def _match_join_keys( "of the same precision and scale" ) - if (np.issubdtype(ltype, np.number)) and (np.issubdtype(rtype, np.number)): + if ( + np.issubdtype(ltype, np.number) + and np.issubdtype(rtype, np.number) + and not ( + np.issubdtype(ltype, np.timedelta64) + or np.issubdtype(rtype, np.timedelta64) + ) + ): common_type = ( max(ltype, rtype) if ltype.kind == rtype.kind else np.find_common_type([], (ltype, rtype)) ) - - elif np.issubdtype(ltype, np.datetime64) and np.issubdtype( - rtype, np.datetime64 + elif ( + np.issubdtype(ltype, np.datetime64) + and np.issubdtype(rtype, np.datetime64) + ) or ( + np.issubdtype(ltype, np.timedelta64) + and np.issubdtype(rtype, np.timedelta64) ): common_type = max(ltype, rtype) + elif ( + np.issubdtype(ltype, np.datetime64) + or np.issubdtype(ltype, np.timedelta64) + ) and not rcol.fillna(0).can_cast_safely(ltype): + raise TypeError( + f"Cannot join between {ltype} and {rtype}, please type-cast both " + "columns to the same type." + ) + elif ( + np.issubdtype(rtype, np.datetime64) + or np.issubdtype(rtype, np.timedelta64) + ) and not lcol.fillna(0).can_cast_safely(rtype): + raise TypeError( + f"Cannot join between {rtype} and {ltype}, please type-cast both " + "columns to the same type." + ) if how == "left" and rcol.fillna(0).can_cast_safely(ltype): return lcol, rcol.astype(ltype) diff --git a/python/cudf/cudf/tests/test_joining.py b/python/cudf/cudf/tests/test_joining.py index 0a1e6f306c3..d3e75dd4a0c 100644 --- a/python/cudf/cudf/tests/test_joining.py +++ b/python/cudf/cudf/tests/test_joining.py @@ -11,6 +11,7 @@ from cudf.testing._utils import ( INTEGER_TYPES, NUMERIC_TYPES, + TIMEDELTA_TYPES, assert_eq, assert_exceptions_equal, expect_warning_if, @@ -2229,3 +2230,36 @@ def test_index_join_names(how): expected = idx1.to_pandas().join(idx2.to_pandas(), how=how) actual = idx1.join(idx2, how=how) assert_join_results_equal(actual, expected, how=how) + + +@pytest.mark.parametrize("dtype", ["datetime64[ns]", "timedelta64[ns]"]) +def test_join_datetime_timedelta_error(dtype): + df1 = cudf.DataFrame({"a": cudf.Series([10, 20, 30], dtype=dtype)}) + df2 = df1.astype("int") + + with pytest.raises(TypeError): + df1.merge(df2) + + +@pytest.mark.parametrize("dtype1", TIMEDELTA_TYPES) +@pytest.mark.parametrize("dtype2", TIMEDELTA_TYPES) +def test_merge_timedelta_types(dtype1, dtype2): + df1 = cudf.DataFrame({"a": cudf.Series([10, 20, 30], dtype=dtype1)}) + df2 = cudf.DataFrame({"a": cudf.Series([20, 500, 33240], dtype=dtype2)}) + + pdf1 = df1.to_pandas() + pdf2 = df2.to_pandas() + actual = df1.merge(df2) + expected = pdf1.merge(pdf2) + + # Pandas is materializing the index, which is unnecessary + # hence the special handling. + assert_eq( + actual, + expected, + check_index_type=False + if isinstance(actual.index, cudf.RangeIndex) + and isinstance(expected.index, pd.Index) + else True, + check_dtype=True, + ) From 1a75b3c4301107bd874e09dcf33a24cfb288c7f9 Mon Sep 17 00:00:00 2001 From: Vukasin Milovanovic Date: Mon, 31 Jul 2023 18:38:31 -0700 Subject: [PATCH 013/230] Update documentation to reflect recent changes in JSON reader and writer (#13791) Adds JSON reader and writer to the list of components that support GDS. Updates the supported data types in JSON reader and writer. 
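For context, an illustrative JSON round trip touching the two operations newly listed as GDS-eligible (whether GDS is actually used depends on the cuFile/KvikIO configuration described on this page; the Python API itself is unchanged):

```python
# Illustrative only: plain cudf JSON I/O calls; GDS participation is a
# deployment/configuration concern, not an API change.
import cudf

df = cudf.DataFrame({"a": [1, 2, 3], "b": ["x", "y", "z"]})
df.to_json("data.json", orient="records", lines=True)  # GDS-eligible write
roundtrip = cudf.read_json("data.json", lines=True)    # GDS-eligible read
```
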
Authors: - Vukasin Milovanovic (https://github.com/vuule) Approvers: - Bradley Dice (https://github.com/bdice) - GALI PREM SAGAR (https://github.com/galipremsagar) URL: https://github.com/rapidsai/cudf/pull/13791 --- docs/cudf/source/user_guide/io/io.md | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) diff --git a/docs/cudf/source/user_guide/io/io.md b/docs/cudf/source/user_guide/io/io.md index c62351925f0..adcdaa51e7e 100644 --- a/docs/cudf/source/user_guide/io/io.md +++ b/docs/cudf/source/user_guide/io/io.md @@ -17,7 +17,7 @@ IO format. +-----------------------+--------+--------+--------+--------+---------+--------+--------+--------+--------+-------------------+--------+--------+---------+---------+ | | CSV | Parquet | JSON | ORC | AVRO | HDF | DLPack | Feather | +-----------------------+--------+--------+--------+--------+---------+--------+--------+--------+--------+---------+---------+--------+--------+---------+---------+ - | Data Type | Writer | Reader | Writer | Reader | Writer¹ | Reader | Writer | Reader | Reader | Writer¹ | Reader¹ | Writer | Reader | Writer¹ | Reader¹ | + | Data Type | Writer | Reader | Writer | Reader | Writer¹ | Reader | Writer | Reader | Reader | Writer² | Reader² | Writer | Reader | Writer² | Reader² | +=======================+========+========+========+========+=========+========+========+========+========+=========+=========+========+========+=========+=========+ | int8 | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | +-----------------------+--------+--------+--------+--------+---------+--------+--------+--------+--------+---------+---------+--------+--------+---------+---------+ @@ -43,9 +43,9 @@ IO format. +-----------------------+--------+--------+--------+--------+---------+--------+--------+--------+--------+---------+---------+--------+--------+---------+---------+ | str | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ | ✅ | ✅ | +-----------------------+--------+--------+--------+--------+---------+--------+--------+--------+--------+---------+---------+--------+--------+---------+---------+ - | category | ✅ | ❌ | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ✅ | ✅ | + | category | ✅ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ✅ | ✅ | +-----------------------+--------+--------+--------+--------+---------+--------+--------+--------+--------+---------+---------+--------+--------+---------+---------+ - | list | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | ❌ | ❌ | ✅ | ✅ | + | list | ❌ | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ | ❌ | ❌ | ❌ | ✅ | ✅ | +-----------------------+--------+--------+--------+--------+---------+--------+--------+--------+--------+---------+---------+--------+--------+---------+---------+ | timedelta64[s] | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ✅ | ✅ | +-----------------------+--------+--------+--------+--------+---------+--------+--------+--------+--------+---------+---------+--------+--------+---------+---------+ @@ -63,13 +63,13 @@ IO format. 
+-----------------------+--------+--------+--------+--------+---------+--------+--------+--------+--------+---------+---------+--------+--------+---------+---------+ | datetime64[ns] | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ | ✅ | ✅ | +-----------------------+--------+--------+--------+--------+---------+--------+--------+--------+--------+---------+---------+--------+--------+---------+---------+ - | struct | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ | ❌ | ✅ | ✅ | + | struct | ❌ | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ | ❌ | ✅ | ✅ | +-----------------------+--------+--------+--------+--------+---------+--------+--------+--------+--------+---------+---------+--------+--------+---------+---------+ - | decimal32 | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | + | decimal32 | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | +-----------------------+--------+--------+--------+--------+---------+--------+--------+--------+--------+---------+---------+--------+--------+---------+---------+ - | decimal64 | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | + | decimal64 | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | +-----------------------+--------+--------+--------+--------+---------+--------+--------+--------+--------+---------+---------+--------+--------+---------+---------+ - | decimal128 | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | + | decimal128 | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | +-----------------------+--------+--------+--------+--------+---------+--------+--------+--------+--------+---------+---------+--------+--------+---------+---------+ ``` @@ -78,7 +78,8 @@ IO format. **Notes:** -- \[¹\] - Not GPU-accelerated. +- \[¹\] - Not all orientations are GPU-accelerated. +- \[²\] - Not GPU-accelerated. ## Magnum IO GPUDirect Storage Integration @@ -123,9 +124,11 @@ tuning parameters in KvikIO see: Operations that support the use of GPUDirect Storage: - {py:func}`cudf.read_avro` +- {py:func}`cudf.read_json` - {py:func}`cudf.read_parquet` - {py:func}`cudf.read_orc` - {py:meth}`cudf.DataFrame.to_csv` +- {py:func}`cudf.DataFrame.to_json` - {py:meth}`cudf.DataFrame.to_parquet` - {py:meth}`cudf.DataFrame.to_orc` From c7b5d228e3d6f603f6699ca22c96cb1894827a04 Mon Sep 17 00:00:00 2001 From: Robert Maynard Date: Tue, 1 Aug 2023 10:30:39 -0400 Subject: [PATCH 014/230] Update get_arrow to arrows 12 CMake target name of arrow::xsimd (#13790) Arrow 12.0 uses the vendored CMake target `arrow::xsimd` instead of the global target name of `xsimd`. We need to use the new name so that libcudf can be used from the build directory by other projects. 
Found by issue: https://github.com/NVIDIA/spark-rapids-jni/issues/1306 Authors: - Robert Maynard (https://github.com/robertmaynard) Approvers: - Bradley Dice (https://github.com/bdice) URL: https://github.com/rapidsai/cudf/pull/13790 --- cpp/cmake/thirdparty/get_arrow.cmake | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/cpp/cmake/thirdparty/get_arrow.cmake b/cpp/cmake/thirdparty/get_arrow.cmake index 036ef880f99..5934c8a2668 100644 --- a/cpp/cmake/thirdparty/get_arrow.cmake +++ b/cpp/cmake/thirdparty/get_arrow.cmake @@ -295,9 +295,9 @@ function(find_and_configure_arrow VERSION BUILD_STATIC ENABLE_S3 ENABLE_ORC ENAB APPEND arrow_code_string " - if(NOT TARGET xsimd) - add_library(xsimd INTERFACE IMPORTED) - target_include_directories(xsimd INTERFACE \"${Arrow_BINARY_DIR}/xsimd_ep/src/xsimd_ep-install/include\") + if(NOT TARGET arrow::xsimd) + add_library(arrow::xsimd INTERFACE IMPORTED) + target_include_directories(arrow::xsimd INTERFACE \"${Arrow_BINARY_DIR}/xsimd_ep/src/xsimd_ep-install/include\") endif() " ) From 3254934dc7f369296d9619e7926635c138ca0eb5 Mon Sep 17 00:00:00 2001 From: Gera Shegalov Date: Tue, 1 Aug 2023 09:01:28 -0700 Subject: [PATCH 015/230] Move Spark-indpendent Table debug to cudf Java (#13783) spark-rapids has [code for debugging JNI Tables/Columns][1] that is useful for debugging during dev work in cudf This PR proposes to start moving it to cudf/java. spark-rapids will be updated to call into the cudf in a follow-up PR. [1]: https://github.com/NVIDIA/spark-rapids/blob/b5cf25eef347d845bd77077d5cb9035262281f98/sql-plugin/src/main/java/com/nvidia/spark/rapids/GpuColumnVector.java ## Sample Usage with JShell ```Bash (rapids) rapids@compose:~/cudf/java$ mvn dependency:build-classpath -Dmdep.outputFile=cudf-java-cp.txt (rapids) rapids@compose:~/cudf/java$ jshell --class-path target/cudf-23.10.0-SNAPSHOT-cuda12.jar:$(< cudf-java-cp.txt) \ -R -Dai.rapids.cudf.debug.output=log_error ``` ```Java | Welcome to JShell -- Version 11.0.20 | For an introduction type: /help intro jshell> import ai.rapids.cudf.*; jshell> Table tbl = new Table.TestBuilder().column(1,2,3,4,5,6).build() tbl ==> Table{columns=[ColumnVector{rows=6, type=INT32, n ... 
e=140381937458144, rows=6} jshell> TableDebug.get().debug("gera", tbl) [main] ERROR ai.rapids.cudf.TableDebug - DEBUG gera Table{columns=[ColumnVector{rows=6, type=INT32, nullCount=Optional[0], offHeap=(ID: 4 7fad371d1a30)}], cudfTable=140381937458144, rows=6} [main] ERROR ai.rapids.cudf.TableDebug - GPU COLUMN 0 - NC: 0 DATA: DeviceMemoryBufferView{address=0x7fad3be00000, length=24, id=-1} VAL: null [main] ERROR ai.rapids.cudf.TableDebug - COLUMN 0 - INT32 [main] ERROR ai.rapids.cudf.TableDebug - 0 1 [main] ERROR ai.rapids.cudf.TableDebug - 1 2 [main] ERROR ai.rapids.cudf.TableDebug - 2 3 [main] ERROR ai.rapids.cudf.TableDebug - 3 4 [main] ERROR ai.rapids.cudf.TableDebug - 4 5 [main] ERROR ai.rapids.cudf.TableDebug - 5 6 ``` Authors: - Gera Shegalov (https://github.com/gerashegalov) Approvers: - Robert (Bobby) Evans (https://github.com/revans2) - Nghia Truong (https://github.com/ttnghia) URL: https://github.com/rapidsai/cudf/pull/13783 --- .../main/java/ai/rapids/cudf/TableDebug.java | 280 ++++++++++++++++++ .../test/java/ai/rapids/cudf/TableTest.java | 11 + 2 files changed, 291 insertions(+) create mode 100644 java/src/main/java/ai/rapids/cudf/TableDebug.java diff --git a/java/src/main/java/ai/rapids/cudf/TableDebug.java b/java/src/main/java/ai/rapids/cudf/TableDebug.java new file mode 100644 index 00000000000..18ec4b086df --- /dev/null +++ b/java/src/main/java/ai/rapids/cudf/TableDebug.java @@ -0,0 +1,280 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package ai.rapids.cudf; + +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.util.Locale; +import java.util.function.Consumer; + +public class TableDebug { + + /** + * Specify one of + * -Dai.rapids.cudf.debug.output=stderr to print directly to standard error (default) + * -Dai.rapids.cudf.debug.output=stdout to print directly to standard output + * -Dai.rapids.cudf.debug.output=log[_level] to redirect to a logging subsystem that can + * further be + * configured. 
+ * Supported log levels: + * debug (default) + * info + * warn + * error + */ + public static final String OUTPUT_STREAM = "ai.rapids.cudf.debug.output"; + private static final Logger log = LoggerFactory.getLogger(TableDebug.class); + + public enum Output { + STDOUT(System.out::println), + STDERR(System.err::println), + LOG(log::debug), + LOG_DEBUG(log::debug), + LOG_INFO(log::info), + LOG_WARN(log::warn), + LOG_ERROR(log::error); + + private final Consumer printFunc; + + Output(Consumer pf) { + this.printFunc = pf; + } + + final void println(String s) { + printFunc.accept(s); + } + } + + + public static class Builder { + private Output outputMode = Output.STDERR; + + public Builder() { + try { + outputMode = Output.valueOf( + System.getProperty(OUTPUT_STREAM, Output.STDERR.name()) + .toUpperCase(Locale.US)); + } catch (Throwable e) { + log.warn("Failed to parse the output mode", e); + } + } + + public Builder withOutput(Output outputMode) { + this.outputMode = outputMode; + return this; + } + + public final TableDebug build() { + return new TableDebug(outputMode); + } + } + + public static Builder builder() { + return new Builder(); + } + + private static final TableDebug DEFAULT_DEBUG = builder().build(); + + public static TableDebug get() { + return DEFAULT_DEBUG; + } + + private final Output output; + + private TableDebug(Output output) { + this.output = output; + } + + /** + * Print the contents of a table. Note that this should never be + * called from production code, as it is very slow. Also note that this is not production + * code. You might need/want to update how the data shows up or add in support for more + * types as this really is just for debugging. + * @param name the name of the table to print out. + * @param table the table to print out. + */ + public synchronized void debug(String name, Table table) { + output.println("DEBUG " + name + " " + table); + for (int col = 0; col < table.getNumberOfColumns(); col++) { + debug(String.valueOf(col), table.getColumn(col)); + } + } + + /** + * Print the contents of a column. Note that this should never be + * called from production code, as it is very slow. Also note that this is not production + * code. You might need/want to update how the data shows up or add in support for more + * types as this really is just for debugging. + * @param name the name of the column to print out. + * @param col the column to print out. + */ + public synchronized void debug(String name, ColumnView col) { + debugGPUAddrs(name, col); + try (HostColumnVector hostCol = col.copyToHost()) { + debug(name, hostCol); + } + } + + private synchronized void debugGPUAddrs(String name, ColumnView col) { + try (BaseDeviceMemoryBuffer data = col.getData(); + BaseDeviceMemoryBuffer validity = col.getValid()) { + output.println("GPU COLUMN " + name + " - NC: " + col.getNullCount() + + " DATA: " + data + " VAL: " + validity); + } + if (col.getType() == DType.STRUCT) { + for (int i = 0; i < col.getNumChildren(); i++) { + try (ColumnView child = col.getChildColumnView(i)) { + debugGPUAddrs(name + ":CHILD_" + i, child); + } + } + } else if (col.getType() == DType.LIST) { + try (ColumnView child = col.getChildColumnView(0)) { + debugGPUAddrs(name + ":DATA", child); + } + } + } + + + /** + * Print the contents of a column. Note that this should never be + * called from production code, as it is very slow. Also note that this is not production + * code. 
You might need/want to update how the data shows up or add in support for more + * types as this really is just for debugging. + * @param name the name of the column to print out. + * @param hostCol the column to print out. + */ + public synchronized void debug(String name, HostColumnVectorCore hostCol) { + DType type = hostCol.getType(); + output.println("COLUMN " + name + " - " + type); + if (type.isDecimalType()) { + for (int i = 0; i < hostCol.getRowCount(); i++) { + if (hostCol.isNull(i)) { + output.println(i + " NULL"); + } else { + output.println(i + " " + hostCol.getBigDecimal(i)); + } + } + } else if (DType.STRING.equals(type)) { + for (int i = 0; i < hostCol.getRowCount(); i++) { + if (hostCol.isNull(i)) { + output.println(i + " NULL"); + } else { + output.println(i + " \"" + hostCol.getJavaString(i) + "\" " + + hexString(hostCol.getUTF8(i))); + } + } + } else if (DType.INT32.equals(type) + || DType.INT8.equals(type) + || DType.INT16.equals(type) + || DType.INT64.equals(type) + || DType.TIMESTAMP_DAYS.equals(type) + || DType.TIMESTAMP_SECONDS.equals(type) + || DType.TIMESTAMP_MICROSECONDS.equals(type) + || DType.TIMESTAMP_MILLISECONDS.equals(type) + || DType.TIMESTAMP_NANOSECONDS.equals(type) + || DType.UINT8.equals(type) + || DType.UINT16.equals(type) + || DType.UINT32.equals(type) + || DType.UINT64.equals(type)) { + debugInteger(hostCol, type); + } else if (DType.BOOL8.equals(type)) { + for (int i = 0; i < hostCol.getRowCount(); i++) { + if (hostCol.isNull(i)) { + output.println(i + " NULL"); + } else { + output.println(i + " " + hostCol.getBoolean(i)); + } + } + } else if (DType.FLOAT64.equals(type)) { + for (int i = 0; i < hostCol.getRowCount(); i++) { + if (hostCol.isNull(i)) { + output.println(i + " NULL"); + } else { + output.println(i + " " + hostCol.getDouble(i)); + } + } + } else if (DType.FLOAT32.equals(type)) { + for (int i = 0; i < hostCol.getRowCount(); i++) { + if (hostCol.isNull(i)) { + output.println(i + " NULL"); + } else { + output.println(i + " " + hostCol.getFloat(i)); + } + } + } else if (DType.STRUCT.equals(type)) { + for (int i = 0; i < hostCol.getRowCount(); i++) { + if (hostCol.isNull(i)) { + output.println(i + " NULL"); + } // The struct child columns are printed out later on. 
+ } + for (int i = 0; i < hostCol.getNumChildren(); i++) { + debug(name + ":CHILD_" + i, hostCol.getChildColumnView(i)); + } + } else if (DType.LIST.equals(type)) { + output.println("OFFSETS"); + for (int i = 0; i < hostCol.getRowCount(); i++) { + if (hostCol.isNull(i)) { + output.println(i + " NULL"); + } else { + output.println(i + " [" + hostCol.getStartListOffset(i) + " - " + + hostCol.getEndListOffset(i) + ")"); + } + } + debug(name + ":DATA", hostCol.getChildColumnView(0)); + } else { + output.println("TYPE " + type + " NOT SUPPORTED FOR DEBUG PRINT"); + } + } + + + private void debugInteger(HostColumnVectorCore hostCol, DType intType) { + for (int i = 0; i < hostCol.getRowCount(); i++) { + if (hostCol.isNull(i)) { + output.println(i + " NULL"); + } else { + final int sizeInBytes = intType.getSizeInBytes(); + final Object value; + switch (sizeInBytes) { + case Byte.BYTES: + value = hostCol.getByte(i); + break; + case Short.BYTES: + value = hostCol.getShort(i); + break; + case Integer.BYTES: + value = hostCol.getInt(i); + break; + case Long.BYTES: + value = hostCol.getLong(i); + break; + default: + throw new IllegalArgumentException("INFEASIBLE: Unsupported integer-like type " + intType); + } + output.println(i + " " + value); + } + } + } + + + private static String hexString(byte[] bytes) { + StringBuilder str = new StringBuilder(); + for (byte b : bytes) { + str.append(String.format("%02x", b & 0xff)); + } + return str.toString(); + } +} diff --git a/java/src/test/java/ai/rapids/cudf/TableTest.java b/java/src/test/java/ai/rapids/cudf/TableTest.java index 02a6b54c542..94de3c6a11c 100644 --- a/java/src/test/java/ai/rapids/cudf/TableTest.java +++ b/java/src/test/java/ai/rapids/cudf/TableTest.java @@ -8033,6 +8033,17 @@ void testParquetWriteToBufferChunkedInt96() { .withDecimalColumn("_c8", 5) .build(); + TableDebug.get().debug("default stderr table0", table0); + TableDebug.builder() + .withOutput(TableDebug.Output.STDOUT) + .build().debug("stdout table0", table0); + TableDebug.builder() + .withOutput(TableDebug.Output.LOG) + .build().debug("slf4j default debug table0", table0); + TableDebug.builder() + .withOutput(TableDebug.Output.LOG_ERROR) + .build().debug("slf4j error table0", table0); + try (TableWriter writer = Table.writeParquetChunked(options, consumer)) { writer.write(table0); writer.write(table0); From 97d60c42baf9f4ba02cc95ecc3f5782030d65757 Mon Sep 17 00:00:00 2001 From: brandon-b-miller <53796099+brandon-b-miller@users.noreply.github.com> Date: Tue, 1 Aug 2023 12:56:09 -0500 Subject: [PATCH 016/230] Remove unnecessary pointer copying in JIT GroupBy Apply (#13792) This PR removes some extra stores and loads that don't appear to be necessary in our groupby apply lowering which are possibly slowing things down. This came up during https://github.com/rapidsai/cudf/pull/13767. 
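From the user's perspective the affected path is the JIT groupby-apply engine; below is a small sketch of the kind of UDF whose reductions go through this lowering (the data and column names are made up for illustration):

```python
import cudf

df = cudf.DataFrame({"key": [0, 0, 1, 1], "val": [1.0, 2.0, 3.0, 4.0]})

def udf(group):
    # group["val"].max() is lowered to the Block* device reductions
    # whose call setup is simplified by this change.
    return group["val"].max()

# engine="jit" selects the JIT groupby-apply path.
result = df.groupby("key").apply(udf, engine="jit")
```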
Authors: - https://github.com/brandon-b-miller Approvers: - Bradley Dice (https://github.com/bdice) URL: https://github.com/rapidsai/cudf/pull/13792 --- python/cudf/cudf/core/udf/groupby_lowering.py | 20 ++++++------------- 1 file changed, 6 insertions(+), 14 deletions(-) diff --git a/python/cudf/cudf/core/udf/groupby_lowering.py b/python/cudf/cudf/core/udf/groupby_lowering.py index 376eccb9308..f82b5aae26c 100644 --- a/python/cudf/cudf/core/udf/groupby_lowering.py +++ b/python/cudf/cudf/core/udf/groupby_lowering.py @@ -37,10 +37,6 @@ def group_reduction_impl_basic(context, builder, sig, args, function): grp_type = sig.args[0] group_dataty = grp_type.group_data_type - # logically take the address of the group's data pointer - group_data_ptr = builder.alloca(grp.group_data.type) - builder.store(grp.group_data, group_data_ptr) - # obtain the correct forward declaration from registry type_key = (sig.return_type, grp_type.group_scalar_type) func = call_cuda_functions[function][type_key] @@ -51,7 +47,7 @@ def group_reduction_impl_basic(context, builder, sig, args, function): builder, func, nb_signature(retty, group_dataty, grp_type.group_size_type), - (builder.load(group_data_ptr), grp.size), + (grp.group_data, grp.size), ) @@ -95,13 +91,6 @@ def group_reduction_impl_idx_max_or_min(context, builder, sig, args, function): "are supported." ) - group_dataty = grp_type.group_data_type - group_data_ptr = builder.alloca(grp.group_data.type) - builder.store(grp.group_data, group_data_ptr) - - index_dataty = grp_type.group_index_type - index_ptr = builder.alloca(grp.index.type) - builder.store(grp.index, index_ptr) type_key = (index_default_type, grp_type.group_scalar_type) func = call_cuda_functions[function][type_key] @@ -109,9 +98,12 @@ def group_reduction_impl_idx_max_or_min(context, builder, sig, args, function): builder, func, nb_signature( - retty, group_dataty, index_dataty, grp_type.group_size_type + retty, + grp_type.group_data_type, + grp_type.group_index_type, + grp_type.group_size_type, ), - (builder.load(group_data_ptr), builder.load(index_ptr), grp.size), + (grp.group_data, grp.index, grp.size), ) From fe307c13fc7456b0d5edfd23ab28d323a8b7fb43 Mon Sep 17 00:00:00 2001 From: Ashwin Srinath <3190405+shwina@users.noreply.github.com> Date: Tue, 1 Aug 2023 22:11:43 -0400 Subject: [PATCH 017/230] Support `corr` in `GroupBy.apply` through the jit engine (#13767) This PR enables computing the pearson correlation between two columns of a group within a UDF. 
Concretely, syntax such as the following will be allowed and produce the same result as pandas: ```python ans = df.groupby('key').apply(lambda group_df: group_df['x'].corr(group_df['y'])) ``` Authors: - Ashwin Srinath (https://github.com/shwina) - https://github.com/brandon-b-miller Approvers: - Bradley Dice (https://github.com/bdice) URL: https://github.com/rapidsai/cudf/pull/13767 --- python/cudf/cudf/core/udf/groupby_lowering.py | 41 ++++++++++ python/cudf/cudf/core/udf/groupby_typing.py | 69 +++++++++++----- python/cudf/cudf/core/udf/groupby_utils.py | 1 - python/cudf/cudf/tests/test_groupby.py | 16 ++++ python/cudf/udf_cpp/shim.cu | 80 +++++++++++++++---- 5 files changed, 171 insertions(+), 36 deletions(-) diff --git a/python/cudf/cudf/core/udf/groupby_lowering.py b/python/cudf/cudf/core/udf/groupby_lowering.py index f82b5aae26c..fe0637cfaef 100644 --- a/python/cudf/cudf/core/udf/groupby_lowering.py +++ b/python/cudf/cudf/core/udf/groupby_lowering.py @@ -51,6 +51,46 @@ def group_reduction_impl_basic(context, builder, sig, args, function): ) +def group_corr(context, builder, sig, args): + """ + Instruction boilerplate used for calling a groupby correlation + """ + lhs_grp = cgutils.create_struct_proxy(sig.args[0])( + context, builder, value=args[0] + ) + rhs_grp = cgutils.create_struct_proxy(sig.args[1])( + context, builder, value=args[1] + ) + + device_func = call_cuda_functions["corr"][ + ( + sig.return_type, + sig.args[0].group_scalar_type, + sig.args[1].group_scalar_type, + ) + ] + result = context.compile_internal( + builder, + device_func, + nb_signature( + types.float64, + types.CPointer( + sig.args[0].group_scalar_type + ), # this group calls corr + types.CPointer( + sig.args[1].group_scalar_type + ), # this group is passed + group_size_type, + ), + ( + lhs_grp.group_data, + rhs_grp.group_data, + lhs_grp.size, + ), + ) + return result + + @lower_builtin(Group, types.Array, group_size_type, types.Array) def group_constructor(context, builder, sig, args): """ @@ -147,3 +187,4 @@ def cuda_Group_size(context, builder, sig, args): cuda_lower("GroupType.idxmin", GroupType(ty, types.int64))( cuda_Group_idxmin ) + cuda_lower("GroupType.corr", GroupType(ty), GroupType(ty))(group_corr) diff --git a/python/cudf/cudf/core/udf/groupby_typing.py b/python/cudf/cudf/core/udf/groupby_typing.py index bc6a084f2b4..97afdd1c6ba 100644 --- a/python/cudf/cudf/core/udf/groupby_typing.py +++ b/python/cudf/cudf/core/udf/groupby_typing.py @@ -104,7 +104,22 @@ def __init__(self, dmm, fe_type): call_cuda_functions: Dict[Any, Any] = {} -def _register_cuda_reduction_caller(funcname, inputty, retty): +def _register_cuda_binary_reduction_caller(funcname, lty, rty, retty): + cuda_func = cuda.declare_device( + f"Block{funcname}_{lty}_{rty}", + retty(types.CPointer(lty), types.CPointer(rty), group_size_type), + ) + + def caller(lhs, rhs, size): + return cuda_func(lhs, rhs, size) + + call_cuda_functions.setdefault(funcname.lower(), {}) + + type_key = retty, lty, rty + call_cuda_functions[funcname.lower()][type_key] = caller + + +def _register_cuda_unary_reduction_caller(funcname, inputty, retty): cuda_func = cuda.declare_device( f"Block{funcname}_{inputty}", retty(types.CPointer(inputty), group_size_type), @@ -191,6 +206,13 @@ def generic(self, args, kws): return nb_signature(self.this.index_type, recvr=self.this) +class GroupCorr(AbstractTemplate): + key = "GroupType.corr" + + def generic(self, args, kws): + return nb_signature(types.float64, args[0], recvr=self.this) + + @cuda_registry.register_attr class 
GroupAttr(AttributeTemplate): key = GroupType @@ -220,33 +242,42 @@ def resolve_idxmin(self, mod): GroupIdxMin, GroupType(mod.group_scalar_type, mod.index_type) ) + def resolve_corr(self, mod): + return types.BoundFunction( + GroupCorr, GroupType(mod.group_scalar_type, mod.index_type) + ) + for ty in SUPPORTED_GROUPBY_NUMBA_TYPES: - _register_cuda_reduction_caller("Max", ty, ty) - _register_cuda_reduction_caller("Min", ty, ty) + _register_cuda_unary_reduction_caller("Max", ty, ty) + _register_cuda_unary_reduction_caller("Min", ty, ty) _register_cuda_idx_reduction_caller("IdxMax", ty) _register_cuda_idx_reduction_caller("IdxMin", ty) -_register_cuda_reduction_caller("Sum", types.int32, types.int64) -_register_cuda_reduction_caller("Sum", types.int64, types.int64) -_register_cuda_reduction_caller("Sum", types.float32, types.float32) -_register_cuda_reduction_caller("Sum", types.float64, types.float64) + if ty in types.integer_domain: + _register_cuda_binary_reduction_caller("Corr", ty, ty, types.float64) + + +_register_cuda_unary_reduction_caller("Sum", types.int32, types.int64) +_register_cuda_unary_reduction_caller("Sum", types.int64, types.int64) +_register_cuda_unary_reduction_caller("Sum", types.float32, types.float32) +_register_cuda_unary_reduction_caller("Sum", types.float64, types.float64) -_register_cuda_reduction_caller("Mean", types.int32, types.float64) -_register_cuda_reduction_caller("Mean", types.int64, types.float64) -_register_cuda_reduction_caller("Mean", types.float32, types.float32) -_register_cuda_reduction_caller("Mean", types.float64, types.float64) +_register_cuda_unary_reduction_caller("Mean", types.int32, types.float64) +_register_cuda_unary_reduction_caller("Mean", types.int64, types.float64) +_register_cuda_unary_reduction_caller("Mean", types.float32, types.float32) +_register_cuda_unary_reduction_caller("Mean", types.float64, types.float64) -_register_cuda_reduction_caller("Std", types.int32, types.float64) -_register_cuda_reduction_caller("Std", types.int64, types.float64) -_register_cuda_reduction_caller("Std", types.float32, types.float32) -_register_cuda_reduction_caller("Std", types.float64, types.float64) +_register_cuda_unary_reduction_caller("Std", types.int32, types.float64) +_register_cuda_unary_reduction_caller("Std", types.int64, types.float64) +_register_cuda_unary_reduction_caller("Std", types.float32, types.float32) +_register_cuda_unary_reduction_caller("Std", types.float64, types.float64) -_register_cuda_reduction_caller("Var", types.int32, types.float64) -_register_cuda_reduction_caller("Var", types.int64, types.float64) -_register_cuda_reduction_caller("Var", types.float32, types.float32) -_register_cuda_reduction_caller("Var", types.float64, types.float64) +_register_cuda_unary_reduction_caller("Var", types.int32, types.float64) +_register_cuda_unary_reduction_caller("Var", types.int64, types.float64) +_register_cuda_unary_reduction_caller("Var", types.float32, types.float32) +_register_cuda_unary_reduction_caller("Var", types.float64, types.float64) for attr in ("group_data", "index", "size"): diff --git a/python/cudf/cudf/core/udf/groupby_utils.py b/python/cudf/cudf/core/udf/groupby_utils.py index ca72c28cd5f..b18720f5db5 100644 --- a/python/cudf/cudf/core/udf/groupby_utils.py +++ b/python/cudf/cudf/core/udf/groupby_utils.py @@ -124,7 +124,6 @@ def _get_groupby_apply_kernel(frame, func, args): "types": types, } kernel_string = _groupby_apply_kernel_string_from_template(frame, args) - kernel = _get_kernel(kernel_string, global_exec_context, 
None, func) return kernel, return_type diff --git a/python/cudf/cudf/tests/test_groupby.py b/python/cudf/cudf/tests/test_groupby.py index 48092be390d..7d22cb70803 100644 --- a/python/cudf/cudf/tests/test_groupby.py +++ b/python/cudf/cudf/tests/test_groupby.py @@ -389,6 +389,8 @@ def groupby_jit_data(): df["key2"] = np.random.randint(0, 2, nelem) df["val1"] = np.random.random(nelem) df["val2"] = np.random.random(nelem) + df["val3"] = np.random.randint(0, 10, nelem) + df["val4"] = np.random.randint(0, 10, nelem) return df @@ -433,6 +435,20 @@ def func(df): run_groupby_apply_jit_test(groupby_jit_data, func, ["key1"]) +@pytest.mark.parametrize("dtype", ["int32", "int64"]) +def test_groupby_apply_jit_correlation(groupby_jit_data, dtype): + + groupby_jit_data["val3"] = groupby_jit_data["val3"].astype(dtype) + groupby_jit_data["val4"] = groupby_jit_data["val4"].astype(dtype) + + keys = ["key1", "key2"] + + def func(group): + return group["val3"].corr(group["val4"]) + + run_groupby_apply_jit_test(groupby_jit_data, func, keys) + + @pytest.mark.parametrize("dtype", ["float64"]) @pytest.mark.parametrize("func", ["min", "max", "sum", "mean", "var", "std"]) @pytest.mark.parametrize("special_val", [np.nan, np.inf, -np.inf]) diff --git a/python/cudf/udf_cpp/shim.cu b/python/cudf/udf_cpp/shim.cu index a81c8238f76..0959b6ba53f 100644 --- a/python/cudf/udf_cpp/shim.cu +++ b/python/cudf/udf_cpp/shim.cu @@ -437,37 +437,55 @@ __device__ double BlockMean(T const* data, int64_t size) } template -__device__ double BlockVar(T const* data, int64_t size) +__device__ double BlockCoVar(T const* lhs, T const* rhs, int64_t size) { auto block = cooperative_groups::this_thread_block(); - __shared__ double block_var; - __shared__ T block_sum; + __shared__ double block_covar; + + __shared__ T block_sum_lhs; + __shared__ T block_sum_rhs; + if (block.thread_rank() == 0) { - block_var = 0; - block_sum = 0; + block_covar = 0; + block_sum_lhs = 0; + block_sum_rhs = 0; } block.sync(); - T local_sum = 0; - double local_var = 0; - - device_sum(block, data, size, &block_sum); + device_sum(block, lhs, size, &block_sum_lhs); + auto const mu_l = static_cast(block_sum_lhs) / static_cast(size); + auto const mu_r = [=]() { + if (lhs == rhs) { + // If the lhs and rhs are the same, this is calculating variance. + // Thus we can assume mu_r = mu_l. 
+ return mu_l; + } else { + device_sum(block, rhs, size, &block_sum_rhs); + return static_cast(block_sum_rhs) / static_cast(size); + } + }(); - auto const mean = static_cast(block_sum) / static_cast(size); + double local_covar = 0; for (int64_t idx = block.thread_rank(); idx < size; idx += block.size()) { - auto const delta = static_cast(data[idx]) - mean; - local_var += delta * delta; + local_covar += (static_cast(lhs[idx]) - mu_l) * (static_cast(rhs[idx]) - mu_r); } - cuda::atomic_ref ref{block_var}; - ref.fetch_add(local_var, cuda::std::memory_order_relaxed); + cuda::atomic_ref ref{block_covar}; + ref.fetch_add(local_covar, cuda::std::memory_order_relaxed); block.sync(); - if (block.thread_rank() == 0) { block_var = block_var / static_cast(size - 1); } + if (block.thread_rank() == 0) { block_covar /= static_cast(size - 1); } block.sync(); - return block_var; + + return block_covar; +} + +template +__device__ double BlockVar(T const* data, int64_t size) +{ + return BlockCoVar(data, data, size); } template @@ -620,6 +638,19 @@ __device__ int64_t BlockIdxMin(T const* data, int64_t* index, int64_t size) return block_idx_min; } +template +__device__ double BlockCorr(T* const lhs_ptr, T* const rhs_ptr, int64_t size) +{ + auto numerator = BlockCoVar(lhs_ptr, rhs_ptr, size); + auto denominator = BlockStd(lhs_ptr, size) * BlockStd(rhs_ptr, size); + + if (denominator == 0.0) { + return 0.0; + } else { + return numerator / denominator; + } +} + extern "C" { #define make_definition(name, cname, type, return_type) \ __device__ int name##_##cname(return_type* numba_return_value, type* const data, int64_t size) \ @@ -684,3 +715,20 @@ make_definition_idx(BlockIdxMax, float32, float); make_definition_idx(BlockIdxMax, float64, double); #undef make_definition_idx } + +extern "C" { +#define make_definition_corr(name, cname, type) \ + __device__ int name##_##cname##_##cname( \ + double* numba_return_value, type* const lhs, type* const rhs, int64_t size) \ + { \ + double const res = name(lhs, rhs, size); \ + *numba_return_value = res; \ + __syncthreads(); \ + return 0; \ + } + +make_definition_corr(BlockCorr, int32, int32_t); +make_definition_corr(BlockCorr, int64, int64_t); + +#undef make_definition_corr +} From 5e8fd8e142991b2a9abdd2d11c1a1e957996597d Mon Sep 17 00:00:00 2001 From: GALI PREM SAGAR Date: Wed, 2 Aug 2023 03:02:54 -0500 Subject: [PATCH 018/230] Remove hangs from trying to construct un-bounded sequences (#13799) Fixes: #13049 This PR allows errors from pyarrow to be propagated when an un-bounded sequence is passed to `pa.array` constructor. 
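The failing pattern covered by the new test is a sequence whose `__getitem__` never raises, so sequence-protocol iteration never terminates:

```python
import cudf

class A:
    def __getitem__(self, key):
        return 1  # never raises IndexError, so iteration never stops

# Previously this spun forever while materializing the input; it now
# surfaces a TypeError (see test_series_constructor_unbounded_sequence).
cudf.Series(A())
```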
Authors: - GALI PREM SAGAR (https://github.com/galipremsagar) Approvers: - Bradley Dice (https://github.com/bdice) URL: https://github.com/rapidsai/cudf/pull/13799 --- python/cudf/cudf/core/column/column.py | 7 ++++++- python/cudf/cudf/tests/test_series.py | 9 +++++++++ 2 files changed, 15 insertions(+), 1 deletion(-) diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index 2f7cc4ba176..b4f3f533d44 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -5,6 +5,7 @@ import builtins import pickle import warnings +from collections import abc from functools import cached_property from itertools import chain from types import SimpleNamespace @@ -2384,12 +2385,16 @@ def as_column( return cudf.core.column.ListColumn.from_sequences( arbitrary ) - else: + elif isinstance(arbitrary, abc.Iterable) or isinstance( + arbitrary, abc.Sequence + ): data = as_column( _construct_array(arbitrary, dtype), dtype=dtype, nan_as_null=nan_as_null, ) + else: + raise e return data diff --git a/python/cudf/cudf/tests/test_series.py b/python/cudf/cudf/tests/test_series.py index 8be1f431ab3..83d22bbca2d 100644 --- a/python/cudf/cudf/tests/test_series.py +++ b/python/cudf/cudf/tests/test_series.py @@ -2204,3 +2204,12 @@ def test_series_contains(data, index): assert_eq(10 in ps, 10 in gs) assert_eq(True in ps, True in gs) assert_eq(False in ps, False in gs) + + +def test_series_constructor_unbounded_sequence(): + class A: + def __getitem__(self, key): + return 1 + + with pytest.raises(TypeError): + cudf.Series(A()) From c412480fc06ae5160cba9f5cdbd1153adb66dc79 Mon Sep 17 00:00:00 2001 From: Gera Shegalov Date: Wed, 2 Aug 2023 06:49:57 -0700 Subject: [PATCH 019/230] Cast only time of day to nanos to avoid an overflow in Parquet INT96 write (#13776) Rework extraction of nanoseconds of the last day in INT96 write call path to avoid overflow. Contributes to NVIDIA/spark-rapids#8625 Fixes #8070 Authors: - Gera Shegalov (https://github.com/gerashegalov) Approvers: - Robert (Bobby) Evans (https://github.com/revans2) - MithunR (https://github.com/mythrocks) - Karthikeyan (https://github.com/karthikeyann) - Nghia Truong (https://github.com/ttnghia) URL: https://github.com/rapidsai/cudf/pull/13776 --- cpp/src/io/parquet/page_enc.cu | 39 ++++++++++++++++++---------------- cpp/tests/io/parquet_test.cpp | 25 ++++++++++++++++++++++ 2 files changed, 46 insertions(+), 18 deletions(-) diff --git a/cpp/src/io/parquet/page_enc.cu b/cpp/src/io/parquet/page_enc.cu index 5136cba3ac0..05f8bba7477 100644 --- a/cpp/src/io/parquet/page_enc.cu +++ b/cpp/src/io/parquet/page_enc.cu @@ -933,22 +933,24 @@ constexpr auto julian_calendar_epoch_diff() } /** - * @brief Converts a timestamp_ns into a pair with nanoseconds since midnight and number of Julian - * days. Does not deal with time zones. Used by INT96 code. + * @brief Converts number `v` of periods of type `PeriodT` into a pair with nanoseconds since + * midnight and number of Julian days. Does not deal with time zones. Used by INT96 code. * - * @param ns number of nanoseconds since epoch - * @return std::pair where nanoseconds is the number of nanoseconds + * @tparam PeriodT a ratio representing the tick period in duration + * @param v count of ticks since epoch + * @return A pair of (nanoseconds, days) where nanoseconds is the number of nanoseconds * elapsed in the day and days is the number of days from Julian epoch. 
*/ -static __device__ std::pair convert_nanoseconds(timestamp_ns const ns) +template +__device__ auto julian_days_with_time(int64_t v) { using namespace cuda::std::chrono; - auto const nanosecond_ticks = ns.time_since_epoch(); - auto const gregorian_days = floor(nanosecond_ticks); - auto const julian_days = gregorian_days + ceil(julian_calendar_epoch_diff()); - - auto const last_day_ticks = nanosecond_ticks - gregorian_days; - return {last_day_ticks, julian_days}; + auto const dur_total = duration{v}; + auto const dur_days = floor(dur_total); + auto const dur_time_of_day = dur_total - dur_days; + auto const dur_time_of_day_nanos = duration_cast(dur_time_of_day); + auto const julian_days = dur_days + ceil(julian_calendar_epoch_diff()); + return std::make_pair(dur_time_of_day_nanos, julian_days); } // blockDim(128, 1, 1) @@ -1236,22 +1238,23 @@ __global__ void __launch_bounds__(128, 8) } } - auto const ret = convert_nanoseconds([&]() { + auto const [last_day_nanos, julian_days] = [&] { + using namespace cuda::std::chrono; switch (s->col.leaf_column->type().id()) { case type_id::TIMESTAMP_SECONDS: case type_id::TIMESTAMP_MILLISECONDS: { - return timestamp_ns{duration_ms{v}}; + return julian_days_with_time(v); } break; case type_id::TIMESTAMP_MICROSECONDS: case type_id::TIMESTAMP_NANOSECONDS: { - return timestamp_ns{duration_us{v}}; + return julian_days_with_time(v); } break; } - return timestamp_ns{duration_ns{0}}; - }()); + return julian_days_with_time(0); + }(); // the 12 bytes of fixed length data. - v = ret.first.count(); + v = last_day_nanos.count(); dst[pos + 0] = v; dst[pos + 1] = v >> 8; dst[pos + 2] = v >> 16; @@ -1260,7 +1263,7 @@ __global__ void __launch_bounds__(128, 8) dst[pos + 5] = v >> 40; dst[pos + 6] = v >> 48; dst[pos + 7] = v >> 56; - uint32_t w = ret.second.count(); + uint32_t w = julian_days.count(); dst[pos + 8] = w; dst[pos + 9] = w >> 8; dst[pos + 10] = w >> 16; diff --git a/cpp/tests/io/parquet_test.cpp b/cpp/tests/io/parquet_test.cpp index 0ac3f659ffe..4e28f536728 100644 --- a/cpp/tests/io/parquet_test.cpp +++ b/cpp/tests/io/parquet_test.cpp @@ -38,6 +38,7 @@ #include #include #include +#include #include #include @@ -6411,4 +6412,28 @@ TEST_F(ParquetReaderTest, FilterFloatNAN) CUDF_TEST_EXPECT_TABLES_EQUAL(expected1->view(), result1); } +TEST_F(ParquetWriterTest, TimestampMicrosINT96NoOverflow) +{ + using namespace cuda::std::chrono; + using namespace cudf::io; + + column_wrapper big_ts_col{ + sys_days{year{3023} / month{7} / day{14}} + 7h + 38min + 45s + 418688us, + sys_days{year{723} / month{3} / day{21}} + 14h + 20min + 13s + microseconds{781ms}}; + + table_view expected({big_ts_col}); + auto filepath = temp_env->get_temp_filepath("BigINT96Timestamp.parquet"); + + auto const out_opts = + parquet_writer_options::builder(sink_info{filepath}, expected).int96_timestamps(true).build(); + write_parquet(out_opts); + + auto const in_opts = parquet_reader_options::builder(source_info(filepath)) + .timestamp_type(cudf::data_type(cudf::type_id::TIMESTAMP_MICROSECONDS)) + .build(); + auto const result = read_parquet(in_opts); + + CUDF_TEST_EXPECT_TABLES_EQUAL(expected, result.tbl->view()); +} + CUDF_TEST_PROGRAM_MAIN() From d5265306afdf8d78f37b223ee2c6d9fbac874484 Mon Sep 17 00:00:00 2001 From: Robert Maynard Date: Wed, 2 Aug 2023 11:41:47 -0400 Subject: [PATCH 020/230] Bug/update libcudf to handle arrow12 changes (#13794) Contiuation of https://github.com/rapidsai/cudf/pull/13790 as more changes are needed to support Arrow 12 builds from source ( both static and shared ). 
This fixes issues when building against Arrow with S3 enabled, and corrects missing `acero` targets. https://github.com/NVIDIA/spark-rapids-jni/issues/1306 Authors: - Robert Maynard (https://github.com/robertmaynard) Approvers: - David Wendt (https://github.com/davidwendt) - Gera Shegalov (https://github.com/gerashegalov) - Bradley Dice (https://github.com/bdice) URL: https://github.com/rapidsai/cudf/pull/13794 --- cpp/cmake/thirdparty/get_arrow.cmake | 28 ++++++++++++++++++++++++--- cpp/tests/io/arrow_io_source_test.cpp | 14 +++++++++++++- 2 files changed, 38 insertions(+), 4 deletions(-) diff --git a/cpp/cmake/thirdparty/get_arrow.cmake b/cpp/cmake/thirdparty/get_arrow.cmake index 5934c8a2668..894dc9649e2 100644 --- a/cpp/cmake/thirdparty/get_arrow.cmake +++ b/cpp/cmake/thirdparty/get_arrow.cmake @@ -162,13 +162,14 @@ function(find_and_configure_arrow VERSION BUILD_STATIC ENABLE_S3 ENABLE_ORC ENAB rapids_cpm_find( Arrow ${VERSION} - GLOBAL_TARGETS arrow_shared parquet_shared arrow_dataset_shared arrow_static parquet_static - arrow_dataset_static + GLOBAL_TARGETS arrow_shared parquet_shared arrow_acero_shared arrow_dataset_shared arrow_static + parquet_static arrow_acero_static arrow_dataset_static CPM_ARGS GIT_REPOSITORY https://github.com/apache/arrow.git GIT_TAG apache-arrow-${VERSION} GIT_SHALLOW TRUE SOURCE_SUBDIR cpp OPTIONS "CMAKE_VERBOSE_MAKEFILE ON" + "ARROW_ACERO ON" "ARROW_IPC ON" "ARROW_DATASET ON" "ARROW_WITH_BACKTRACE ON" @@ -221,7 +222,8 @@ function(find_and_configure_arrow VERSION BUILD_STATIC ENABLE_S3 ENABLE_ORC ENAB # Set this to enable `find_package(Parquet)` set(Parquet_DIR "${Arrow_DIR}") endif() - # Set this to enable `find_package(ArrowDataset)` + # Set this to enable `find_package(ArrowDataset)`. This will call find_package(ArrowAcero) for + # us set(ArrowDataset_DIR "${Arrow_DIR}") find_package(ArrowDataset REQUIRED QUIET) endif() @@ -314,6 +316,26 @@ function(find_and_configure_arrow VERSION BUILD_STATIC ENABLE_S3 ENABLE_ORC ENAB if(ENABLE_PARQUET) + set(arrow_acero_code_string + [=[ + if (TARGET cudf::arrow_acero_shared AND (NOT TARGET arrow_acero_shared)) + add_library(arrow_acero_shared ALIAS cudf::arrow_acero_shared) + endif() + if (TARGET cudf::arrow_acero_static AND (NOT TARGET arrow_acero_static)) + add_library(arrow_acero_static ALIAS cudf::arrow_acero_static) + endif() + ]=] + ) + + rapids_export( + BUILD ArrowAcero + VERSION ${VERSION} + EXPORT_SET arrow_acero_targets + GLOBAL_TARGETS arrow_acero_shared arrow_acero_static + NAMESPACE cudf:: + FINAL_CODE_BLOCK arrow_acero_code_string + ) + set(arrow_dataset_code_string [=[ if (TARGET cudf::arrow_dataset_shared AND (NOT TARGET arrow_dataset_shared)) diff --git a/cpp/tests/io/arrow_io_source_test.cpp b/cpp/tests/io/arrow_io_source_test.cpp index d7f1879040b..fb9e20843ed 100644 --- a/cpp/tests/io/arrow_io_source_test.cpp +++ b/cpp/tests/io/arrow_io_source_test.cpp @@ -87,7 +87,19 @@ TEST_F(ArrowIOTest, S3FileSystem) ASSERT_EQ(1, tbl.tbl->num_columns()); // Only single column specified in reader_options ASSERT_EQ(244, tbl.tbl->num_rows()); // known number of rows from the S3 file } - CUDF_EXPECTS(arrow::fs::EnsureS3Finalized().ok(), "Failed to finalize s3 filesystem"); + if (!s3_unsupported) { + // Verify that we are using Arrow with S3, and call finalize + // https://github.com/apache/arrow/issues/36974 + // This needs to be in a separate conditional to ensure we call + // finalize after all arrow_io_source instances have been deleted. 
+ void* whole_app = dlopen(NULL, RTLD_LAZY); + decltype(arrow::fs::EnsureS3Finalized)* close_s3_func = nullptr; + + close_s3_func = reinterpret_cast( + dlsym(whole_app, "_ZN5arrow2fs17EnsureS3FinalizedEv")); + if (close_s3_func) { CUDF_EXPECTS(close_s3_func().ok(), "Failed to finalize s3 filesystem"); } + dlclose(whole_app); + } } CUDF_TEST_PROGRAM_MAIN() From f46cb31b602c7e23a9c08a64add28526f4ac7bf2 Mon Sep 17 00:00:00 2001 From: Divye Gala Date: Wed, 2 Aug 2023 20:00:35 -0400 Subject: [PATCH 021/230] Reduce `lists::contains` dispatches for scalars (#13805) This PR is to prepare for https://github.com/rapidsai/cudf/pull/13672 which uses experimental comparators for both nested and non-nested types. `lists::contains` currently has two APIs that support: 1. `cudf::scalar` 2. `cudf::column_view` Both APIs currently need to call `cudf::type_dispatcher`. However, by converting `cudf::scalar` to a `cudf::column_view` by materializing all rows, we cut down the number of dispatches to half. Compile times: Before this PR: [12:07](https://downloads.rapids.ai/ci/cudf/pull-request/13788/0498f7d/cuda11_x86_64.ninja_log.html) After this PR: [5:46](https://downloads.rapids.ai/ci/cudf/pull-request/13805/7664a98/cuda11_x86_64.ninja_log.html) Authors: - Divye Gala (https://github.com/divyegala) Approvers: - Bradley Dice (https://github.com/bdice) - Karthikeyan (https://github.com/karthikeyann) - Nghia Truong (https://github.com/ttnghia) URL: https://github.com/rapidsai/cudf/pull/13805 --- cpp/src/lists/contains.cu | 142 ++++++++++++-------------------------- 1 file changed, 46 insertions(+), 96 deletions(-) diff --git a/cpp/src/lists/contains.cu b/cpp/src/lists/contains.cu index a3293e36825..9d39f2f9a90 100644 --- a/cpp/src/lists/contains.cu +++ b/cpp/src/lists/contains.cu @@ -18,6 +18,7 @@ #include #include #include +#include #include #include #include @@ -154,16 +155,11 @@ struct search_list_nested_types_fn { duplicate_find_option const find_option; KeyValidityIter const key_validity_iter; EqComparator const d_comp; - bool const search_key_is_scalar; search_list_nested_types_fn(duplicate_find_option const find_option, KeyValidityIter const key_validity_iter, - EqComparator const& d_comp, - bool search_key_is_scalar) - : find_option(find_option), - key_validity_iter(key_validity_iter), - d_comp(d_comp), - search_key_is_scalar(search_key_is_scalar) + EqComparator const& d_comp) + : find_option(find_option), key_validity_iter(key_validity_iter), d_comp(d_comp) { } @@ -186,9 +182,8 @@ struct search_list_nested_types_fn { auto const [begin, end] = element_index_pair_iter(list.size()); auto const found_iter = thrust::find_if(thrust::seq, begin, end, [=] __device__(auto const idx) { - return !list.is_null(idx) && - d_comp(static_cast(list.element_offset(idx)), - static_cast(search_key_is_scalar ? 0 : list.row_index())); + return !list.is_null(idx) && d_comp(static_cast(list.element_offset(idx)), + static_cast(list.row_index())); }); // If the key is found, return its found position in the list from `found_iter`. return found_iter == end ? NOT_FOUND_SENTINEL : *found_iter; @@ -199,93 +194,53 @@ struct search_list_nested_types_fn { * @brief Function to search for key element(s) in the corresponding rows of a lists column, * specialized for non-nested types. 
*/ -template +template void index_of_non_nested_types(InputIterator input_it, size_type num_rows, OutputIterator output_it, - SearchKeyType const& search_keys, + column_view const& search_keys, bool search_keys_have_nulls, duplicate_find_option find_option, rmm::cuda_stream_view stream) { - auto const do_search = [=](auto const keys_iter) { - thrust::transform(rmm::exec_policy(stream), - input_it, - input_it + num_rows, - keys_iter, - output_it, - search_list_non_nested_types_fn{find_option}); - }; - - if constexpr (search_key_is_scalar) { - auto const keys_iter = cudf::detail::make_optional_iterator( - search_keys, nullate::DYNAMIC{search_keys_have_nulls}); - do_search(keys_iter); - } else { - auto const keys_cdv_ptr = column_device_view::create(search_keys, stream); - auto const keys_iter = cudf::detail::make_optional_iterator( - *keys_cdv_ptr, nullate::DYNAMIC{search_keys_have_nulls}); - do_search(keys_iter); - } + auto const keys_cdv_ptr = column_device_view::create(search_keys, stream); + auto const keys_iter = cudf::detail::make_optional_iterator( + *keys_cdv_ptr, nullate::DYNAMIC{search_keys_have_nulls}); + thrust::transform(rmm::exec_policy(stream), + input_it, + input_it + num_rows, + keys_iter, + output_it, + search_list_non_nested_types_fn{find_option}); } /** * @brief Function to search for index of key element(s) in the corresponding rows of a lists * column, specialized for nested types. */ -template +template void index_of_nested_types(InputIterator input_it, size_type num_rows, OutputIterator output_it, column_view const& child, - SearchKeyType const& search_keys, + column_view const& search_keys, duplicate_find_option find_option, rmm::cuda_stream_view stream) { - // Create a `table_view` from the search key(s). - // If the input search key is a (nested type) scalar, a new column is materialized from that - // scalar before a `table_view` is generated from it. As such, the new created column will also be - // returned to keep the result `table_view` valid. 
- [[maybe_unused]] auto const [keys_tview, unused_column] = - [&]() -> std::pair> { - if constexpr (search_key_is_scalar) { - auto tmp_column = make_column_from_scalar(search_keys, 1, stream); - return {table_view{{tmp_column->view()}}, std::move(tmp_column)}; - } else { - return {table_view{{search_keys}}, nullptr}; - } - }(); - + auto const keys_tview = cudf::table_view{{search_keys}}; auto const child_tview = table_view{{child}}; auto const has_nulls = has_nested_nulls(child_tview) || has_nested_nulls(keys_tview); auto const comparator = cudf::experimental::row::equality::two_table_comparator(child_tview, keys_tview, stream); auto const d_comp = comparator.equal_to(nullate::DYNAMIC{has_nulls}); - auto const do_search = [=](auto const key_validity_iter) { - thrust::transform( - rmm::exec_policy(stream), - input_it, - input_it + num_rows, - output_it, - search_list_nested_types_fn{find_option, key_validity_iter, d_comp, search_key_is_scalar}); - }; - - if constexpr (search_key_is_scalar) { - auto const key_validity_iter = cudf::detail::make_validity_iterator(search_keys); - do_search(key_validity_iter); - } else { - auto const keys_dv_ptr = column_device_view::create(search_keys, stream); - auto const key_validity_iter = cudf::detail::make_validity_iterator(*keys_dv_ptr); - do_search(key_validity_iter); - } + auto const keys_dv_ptr = column_device_view::create(search_keys, stream); + auto const key_validity_iter = cudf::detail::make_validity_iterator(*keys_dv_ptr); + thrust::transform(rmm::exec_policy(stream), + input_it, + input_it + num_rows, + output_it, + search_list_nested_types_fn{find_option, key_validity_iter, d_comp}); } /** @@ -295,10 +250,10 @@ void index_of_nested_types(InputIterator input_it, struct dispatch_index_of { // SFINAE with conditional return type because we need to support device lambda in this function. // This is required due to a limitation of nvcc. - template + template std::enable_if_t(), std::unique_ptr> operator()( lists_column_view const& lists, - SearchKeyType const& search_keys, + column_view const& search_keys, duplicate_find_option find_option, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) const @@ -313,27 +268,10 @@ struct dispatch_index_of { cudf::data_type_error); CUDF_EXPECTS(search_keys.type().id() != type_id::EMPTY, "Type cannot be empty."); - auto constexpr search_key_is_scalar = std::is_same_v; - auto const search_keys_have_nulls = [&search_keys, stream] { - if constexpr (search_key_is_scalar) { - return !search_keys.is_valid(stream); - } else { - return search_keys.has_nulls(); - } - }(); + auto const search_keys_have_nulls = search_keys.has_nulls(); auto const num_rows = lists.size(); - if (search_key_is_scalar && search_keys_have_nulls) { - // If the scalar key is invalid/null, the entire output column will be all nulls. 
- return make_numeric_column(data_type{cudf::type_to_id()}, - num_rows, - cudf::create_null_mask(num_rows, mask_state::ALL_NULL, mr), - num_rows, - stream, - mr); - } - auto const lists_cdv_ptr = column_device_view::create(lists.parent(), stream); auto const input_it = cudf::detail::make_counting_transform_iterator( size_type{0}, @@ -346,11 +284,10 @@ struct dispatch_index_of { auto const output_it = out_positions->mutable_view().template begin(); if constexpr (not cudf::is_nested()) { - index_of_non_nested_types( + index_of_non_nested_types( input_it, num_rows, output_it, search_keys, search_keys_have_nulls, find_option, stream); } else { // list + struct - index_of_nested_types( - input_it, num_rows, output_it, child, search_keys, find_option, stream); + index_of_nested_types(input_it, num_rows, output_it, child, search_keys, find_option, stream); } if (search_keys_have_nulls || lists.has_nulls()) { @@ -414,8 +351,21 @@ std::unique_ptr index_of(lists_column_view const& lists, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { - return cudf::type_dispatcher( - search_key.type(), dispatch_index_of{}, lists, search_key, find_option, stream, mr); + if (!search_key.is_valid(stream)) { + return make_numeric_column(data_type{cudf::type_to_id()}, + lists.size(), + cudf::create_null_mask(lists.size(), mask_state::ALL_NULL, mr), + lists.size(), + stream, + mr); + } + if (lists.size() == 0) { + return make_numeric_column( + data_type{type_to_id()}, 0, cudf::mask_state::UNALLOCATED, stream, mr); + } + + auto search_key_col = cudf::make_column_from_scalar(search_key, lists.size(), stream, mr); + return index_of(lists, search_key_col->view(), find_option, stream, mr); } std::unique_ptr index_of(lists_column_view const& lists, From 11fd25c25e8d593ddfb4e0d29281aaf63898d9ba Mon Sep 17 00:00:00 2001 From: GALI PREM SAGAR Date: Wed, 2 Aug 2023 19:59:04 -0500 Subject: [PATCH 022/230] Fix unbounded sequence issue in `DataFrame` constructor (#13811) In `cudf`, we currently have a hang in this scenario: ```python In [1]: import cudf In [2]: class A: ...: def __getitem__(self, key): ...: return 1 ...: In [3]: cudf.DataFrame([A()]) ``` This PR introduces additional checks before letting the list-like inputs pass onto `itertools` for transposing. 
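The new behavior, mirroring the added tests: both constructor paths now raise instead of hanging.

```python
import cudf

class A:
    def __getitem__(self, key):
        return 1

for bad_input in ([A()], {"a": A()}):
    try:
        cudf.DataFrame(bad_input)
    except TypeError as err:
        # Previously both of these calls hung; now they fail fast.
        print("rejected:", err)
```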
Authors: - GALI PREM SAGAR (https://github.com/galipremsagar) Approvers: - Bradley Dice (https://github.com/bdice) URL: https://github.com/rapidsai/cudf/pull/13811 --- python/cudf/cudf/core/dataframe.py | 6 ++++++ python/cudf/cudf/tests/test_dataframe.py | 12 ++++++++++++ 2 files changed, 18 insertions(+) diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index a510e6829d1..d421258b06b 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -843,6 +843,12 @@ def _init_from_list_like(self, data, index=None, columns=None): data = DataFrame.from_pandas(pd.DataFrame(data)) self._data = data._data else: + if any( + not isinstance(col, (abc.Iterable, abc.Sequence)) + for col in data + ): + raise TypeError("Inputs should be an iterable or sequence.") + data = list(itertools.zip_longest(*data)) if columns is not None and len(data) == 0: diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py index aad0b757177..e35ab147bf4 100644 --- a/python/cudf/cudf/tests/test_dataframe.py +++ b/python/cudf/cudf/tests/test_dataframe.py @@ -10243,3 +10243,15 @@ def test_dataframe_init_columns_named_index(): pdf = pd.DataFrame(data, columns=columns) assert_eq(gdf, pdf) + + +def test_dataframe_constructor_unbounded_sequence(): + class A: + def __getitem__(self, key): + return 1 + + with pytest.raises(TypeError): + cudf.DataFrame([A()]) + + with pytest.raises(TypeError): + cudf.DataFrame({"a": A()}) From 399efb960f689085bf671f6fa62916b1020e3b30 Mon Sep 17 00:00:00 2001 From: Ed Seidl Date: Wed, 2 Aug 2023 23:50:47 -0700 Subject: [PATCH 023/230] Fix for Parquet writer when requested pages per row is smaller than fragment size (#13806) #12685 introduced a bug in page calculation. If the `max_page_size_rows` parameter is set smaller than the page fragment size, the writer will produce a spurious empty page. This PR fixes this by only checking the fragment size if there are already rows in the page, and then returns the old check for number of rows exceeding the page limit. Interestingly, libcudf can read these files with empty pages just fine, but parquet-mr cannot. 
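A sketch of a writer configuration that used to hit this, with the requested rows per page below the default page fragment size of 5000; it assumes the Python `to_parquet` exposes `max_page_size_rows`, as recent cudf releases do:

```python
import cudf

df = cudf.DataFrame({"a": ["x" * 32] * 3000})

# Requesting 1000 rows per page while fragments hold 5000 rows previously
# emitted a spurious empty leading page; with this fix the page-size check
# only applies once the page already holds at least one fragment.
df.to_parquet("small_pages.parquet", max_page_size_rows=1000)
```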
Authors: - Ed Seidl (https://github.com/etseidl) Approvers: - Nghia Truong (https://github.com/ttnghia) - Vukasin Milovanovic (https://github.com/vuule) URL: https://github.com/rapidsai/cudf/pull/13806 --- cpp/src/io/parquet/page_enc.cu | 12 ++++++++--- cpp/tests/io/parquet_test.cpp | 38 ++++++++++++++++++++++++++++++++++ 2 files changed, 47 insertions(+), 3 deletions(-) diff --git a/cpp/src/io/parquet/page_enc.cu b/cpp/src/io/parquet/page_enc.cu index 05f8bba7477..190f70d0747 100644 --- a/cpp/src/io/parquet/page_enc.cu +++ b/cpp/src/io/parquet/page_enc.cu @@ -432,9 +432,15 @@ __global__ void __launch_bounds__(128) max_RLE_page_size(col_g.num_def_level_bits(), num_vals) + max_RLE_page_size(col_g.num_rep_level_bits(), num_vals)); - if (num_rows >= ck_g.num_rows || - (values_in_page > 0 && (page_size + fragment_data_size > this_max_page_size)) || - rows_in_page + frag_g.num_rows > max_page_size_rows) { + // checks to see when we need to close the current page and start a new one + auto const is_last_chunk = num_rows >= ck_g.num_rows; + auto const is_page_bytes_exceeded = page_size + fragment_data_size > this_max_page_size; + auto const is_page_rows_exceeded = rows_in_page + frag_g.num_rows > max_page_size_rows; + // only check for limit overflow if there's already at least one fragment for this page + auto const is_page_too_big = + values_in_page > 0 && (is_page_bytes_exceeded || is_page_rows_exceeded); + + if (is_last_chunk || is_page_too_big) { if (ck_g.use_dictionary) { // Additional byte to store entry bit width page_size = 1 + max_RLE_page_size(ck_g.dict_rle_bits, values_in_page); diff --git a/cpp/tests/io/parquet_test.cpp b/cpp/tests/io/parquet_test.cpp index 4e28f536728..a5054daed19 100644 --- a/cpp/tests/io/parquet_test.cpp +++ b/cpp/tests/io/parquet_test.cpp @@ -3709,6 +3709,44 @@ TEST_F(ParquetWriterTest, CheckPageRowsAdjusted) EXPECT_LE(ph.data_page_header.num_values, rows_per_page); } +TEST_F(ParquetWriterTest, CheckPageRowsTooSmall) +{ + constexpr auto rows_per_page = 1'000; + constexpr auto fragment_size = 5'000; + constexpr auto num_rows = 3 * rows_per_page; + const std::string s1(32, 'a'); + auto col0_elements = + cudf::detail::make_counting_transform_iterator(0, [&](auto i) { return s1; }); + auto col0 = cudf::test::strings_column_wrapper(col0_elements, col0_elements + num_rows); + + auto const expected = table_view{{col0}}; + + auto const filepath = temp_env->get_temp_filepath("CheckPageRowsTooSmall.parquet"); + const cudf::io::parquet_writer_options out_opts = + cudf::io::parquet_writer_options::builder(cudf::io::sink_info{filepath}, expected) + .max_page_fragment_size(fragment_size) + .max_page_size_rows(rows_per_page); + cudf::io::write_parquet(out_opts); + + // check that file is written correctly when rows/page < fragment size + auto const source = cudf::io::datasource::create(filepath); + cudf::io::parquet::FileMetaData fmd; + + read_footer(source, &fmd); + ASSERT_TRUE(fmd.row_groups.size() > 0); + ASSERT_TRUE(fmd.row_groups[0].columns.size() == 1); + auto const& first_chunk = fmd.row_groups[0].columns[0].meta_data; + ASSERT_TRUE(first_chunk.data_page_offset > 0); + + // read first data page header. sizeof(PageHeader) is not exact, but the thrift encoded + // version should be smaller than size of the struct. 
+ auto const ph = read_page_header( + source, {first_chunk.data_page_offset, sizeof(cudf::io::parquet::PageHeader), 0}); + + // there should be only one page since the fragment size is larger than rows_per_page + EXPECT_EQ(ph.data_page_header.num_values, num_rows); +} + TEST_F(ParquetWriterTest, Decimal128Stats) { // check that decimal128 min and max statistics are written in network byte order From 9c559c94fcaa1525d7c95faf94e5486fcd992ef1 Mon Sep 17 00:00:00 2001 From: David Wendt <45795991+davidwendt@users.noreply.github.com> Date: Thu, 3 Aug 2023 08:22:51 -0400 Subject: [PATCH 024/230] Remove the libcudf cudf::offset_type type (#13788) Replace all occurrences of `cudf::offset_type` with `cudf::size_type` This helps clear up code where sizes are computed and then converted to offsets in-place. Also, reference #13733 Authors: - David Wendt (https://github.com/davidwendt) Approvers: - https://github.com/brandon-b-miller - Nghia Truong (https://github.com/ttnghia) - Bradley Dice (https://github.com/bdice) - MithunR (https://github.com/mythrocks) URL: https://github.com/rapidsai/cudf/pull/13788 --- cpp/benchmarks/copying/contiguous_split.cu | 2 +- cpp/benchmarks/lists/copying/scatter_lists.cu | 12 ++--- cpp/examples/strings/custom_prealloc.cu | 2 +- .../cudf/detail/sizes_to_offsets_iterator.cuh | 4 +- cpp/include/cudf/lists/detail/scatter.cuh | 8 ++-- cpp/include/cudf/lists/lists_column_view.hpp | 6 +-- .../cudf/strings/strings_column_view.hpp | 6 +-- .../cudf/tdigest/tdigest_column_view.hpp | 4 +- cpp/include/cudf/types.hpp | 1 - cpp/include/cudf_test/column_utilities.hpp | 4 +- cpp/include/cudf_test/column_wrapper.hpp | 2 +- cpp/src/copying/concatenate.cu | 6 +-- cpp/src/copying/contiguous_split.cu | 15 +++---- cpp/src/groupby/groupby.cu | 2 +- cpp/src/groupby/sort/group_collect.cu | 4 +- cpp/src/groupby/sort/group_merge_lists.cu | 6 +-- cpp/src/io/csv/writer_impl.cu | 8 ++-- cpp/src/io/json/json_column.cu | 4 +- cpp/src/io/json/write_json.cu | 12 ++--- cpp/src/io/parquet/reader_impl_preprocess.cu | 4 +- cpp/src/io/statistics/statistics.cuh | 2 +- .../combine/concatenate_list_elements.cu | 12 +++-- cpp/src/lists/combine/concatenate_rows.cu | 19 ++++---- cpp/src/lists/copying/scatter_helper.cu | 12 ++--- cpp/src/lists/interleave_columns.cu | 24 +++++----- cpp/src/lists/lists_column_factories.cu | 8 ++-- cpp/src/lists/reverse.cu | 2 +- cpp/src/lists/sequences.cu | 10 ++--- .../stream_compaction/apply_boolean_mask.cu | 10 ++--- cpp/src/lists/utilities.cu | 10 ++--- cpp/src/quantiles/tdigest/tdigest.cu | 16 +++---- .../quantiles/tdigest/tdigest_aggregation.cu | 44 +++++++++---------- cpp/src/rolling/detail/nth_element.cuh | 4 +- cpp/src/rolling/detail/rolling.cuh | 2 +- .../rolling/detail/rolling_collect_list.cu | 4 +- cpp/src/rolling/grouped_rolling.cu | 4 +- cpp/src/strings/capitalize.cu | 12 ++--- cpp/src/strings/combine/concatenate.cu | 6 +-- cpp/src/strings/combine/join_list_elements.cu | 6 +-- cpp/src/strings/convert/convert_booleans.cu | 2 +- cpp/src/strings/convert/convert_durations.cu | 2 +- .../strings/convert/convert_fixed_point.cu | 2 +- cpp/src/strings/convert/convert_hex.cu | 4 +- cpp/src/strings/convert/convert_lists.cu | 4 +- cpp/src/strings/convert/convert_urls.cu | 19 ++++---- cpp/src/strings/copying/concatenate.cu | 4 +- cpp/src/strings/copying/shift.cu | 18 ++++---- cpp/src/strings/extract/extract_all.cu | 8 ++-- cpp/src/strings/json/json_path.cu | 16 +++---- cpp/src/strings/padding.cu | 4 +- cpp/src/strings/repeat_strings.cu | 10 ++--- cpp/src/strings/reverse.cu | 6 +-- 
cpp/src/strings/search/find_multiple.cu | 6 +-- cpp/src/strings/search/findall.cu | 6 +-- cpp/src/strings/split/split_re.cu | 12 ++--- cpp/src/strings/strings_column_view.cpp | 4 +- cpp/src/text/subword/bpe_tokenizer.cu | 20 ++++----- cpp/src/text/subword/load_merges_file.cu | 2 +- cpp/src/transform/row_bit_count.cu | 12 ++--- cpp/tests/column/factories_test.cpp | 12 ++--- cpp/tests/copying/concatenate_tests.cpp | 8 ++-- .../copying/copy_if_else_nested_tests.cpp | 2 +- cpp/tests/copying/gather_struct_tests.cpp | 4 +- cpp/tests/copying/get_value_tests.cpp | 20 ++++----- .../copying/scatter_list_scalar_tests.cpp | 8 ++-- cpp/tests/copying/split_tests.cpp | 6 +-- cpp/tests/groupby/collect_list_tests.cpp | 8 ++-- cpp/tests/io/parquet_test.cpp | 12 ++--- cpp/tests/lists/extract_tests.cpp | 6 +-- .../quantiles/percentile_approx_test.cpp | 4 +- cpp/tests/reductions/tdigest_tests.cu | 6 +-- cpp/tests/strings/array_tests.cpp | 2 +- cpp/tests/strings/contains_tests.cpp | 6 +-- cpp/tests/strings/factories_test.cu | 8 ++-- cpp/tests/transform/row_bit_count_test.cu | 43 +++++++++--------- cpp/tests/utilities/column_utilities.cu | 6 +-- cpp/tests/utilities/tdigest_utilities.cu | 6 +-- .../column_utilities_tests.cpp | 2 +- java/src/main/native/src/ColumnViewJni.cu | 6 +-- java/src/main/native/src/row_conversion.cu | 6 +-- python/cudf/cudf/_lib/cpp/types.pxd | 1 - python/cudf/cudf/_lib/pylibcudf/column.pxd | 4 +- python/cudf/cudf/_lib/pylibcudf/column.pyx | 4 +- 83 files changed, 325 insertions(+), 345 deletions(-) diff --git a/cpp/benchmarks/copying/contiguous_split.cu b/cpp/benchmarks/copying/contiguous_split.cu index cad5a7c7b58..910fc689c0b 100644 --- a/cpp/benchmarks/copying/contiguous_split.cu +++ b/cpp/benchmarks/copying/contiguous_split.cu @@ -151,7 +151,7 @@ void BM_contiguous_split_strings(benchmark::State& state, ContiguousSplitImpl& i } int64_t const total_bytes = - total_desired_bytes + ((num_rows + 1) * sizeof(cudf::offset_type)) + + total_desired_bytes + ((num_rows + 1) * sizeof(cudf::size_type)) + (include_validity ? 
(max(int64_t{1}, (num_rows / 32)) * sizeof(cudf::bitmask_type) * num_cols) : 0); diff --git a/cpp/benchmarks/lists/copying/scatter_lists.cu b/cpp/benchmarks/lists/copying/scatter_lists.cu index 85d730b94fb..dbc3234dabf 100644 --- a/cpp/benchmarks/lists/copying/scatter_lists.cu +++ b/cpp/benchmarks/lists/copying/scatter_lists.cu @@ -62,26 +62,26 @@ void BM_lists_scatter(::benchmark::State& state) target_base_col->mutable_view().end()); auto source_offsets = - make_fixed_width_column(cudf::data_type{cudf::type_to_id()}, + make_fixed_width_column(cudf::data_type{cudf::type_to_id()}, num_rows + 1, cudf::mask_state::UNALLOCATED, stream, mr); auto target_offsets = - make_fixed_width_column(cudf::data_type{cudf::type_to_id()}, + make_fixed_width_column(cudf::data_type{cudf::type_to_id()}, num_rows + 1, cudf::mask_state::UNALLOCATED, stream, mr); thrust::sequence(rmm::exec_policy(stream), - source_offsets->mutable_view().begin(), - source_offsets->mutable_view().end(), + source_offsets->mutable_view().begin(), + source_offsets->mutable_view().end(), 0, num_elements_per_row); thrust::sequence(rmm::exec_policy(stream), - target_offsets->mutable_view().begin(), - target_offsets->mutable_view().end(), + target_offsets->mutable_view().begin(), + target_offsets->mutable_view().end(), 0, num_elements_per_row); diff --git a/cpp/examples/strings/custom_prealloc.cu b/cpp/examples/strings/custom_prealloc.cu index a956550f505..0af4c47e947 100644 --- a/cpp/examples/strings/custom_prealloc.cu +++ b/cpp/examples/strings/custom_prealloc.cu @@ -41,7 +41,7 @@ __global__ void redact_kernel(cudf::column_device_view const d_names, cudf::column_device_view const d_visibilities, cudf::string_view redaction, char* working_memory, - cudf::offset_type const* d_offsets, + cudf::size_type const* d_offsets, cudf::string_view* d_output) { // The row index is resolved from the CUDA thread/block objects diff --git a/cpp/include/cudf/detail/sizes_to_offsets_iterator.cuh b/cpp/include/cudf/detail/sizes_to_offsets_iterator.cuh index 532d66c8483..155b1ce5691 100644 --- a/cpp/include/cudf/detail/sizes_to_offsets_iterator.cuh +++ b/cpp/include/cudf/detail/sizes_to_offsets_iterator.cuh @@ -303,9 +303,9 @@ std::pair, size_type> make_offsets_child_column( { auto count = static_cast(std::distance(begin, end)); auto offsets_column = make_numeric_column( - data_type{type_to_id()}, count + 1, mask_state::UNALLOCATED, stream, mr); + data_type{type_to_id()}, count + 1, mask_state::UNALLOCATED, stream, mr); auto offsets_view = offsets_column->mutable_view(); - auto d_offsets = offsets_view.template data(); + auto d_offsets = offsets_view.template data(); // The number of offsets is count+1 so to build the offsets from the sizes // using exclusive-scan technically requires count+1 input values even though diff --git a/cpp/include/cudf/lists/detail/scatter.cuh b/cpp/include/cudf/lists/detail/scatter.cuh index 18cb147d1e4..f04b2fda2bf 100644 --- a/cpp/include/cudf/lists/detail/scatter.cuh +++ b/cpp/include/cudf/lists/detail/scatter.cuh @@ -240,11 +240,11 @@ std::unique_ptr scatter(scalar const& slr, rmm::device_buffer null_mask = slr_valid ? 
cudf::detail::create_null_mask(1, mask_state::UNALLOCATED, stream, mr) : cudf::detail::create_null_mask(1, mask_state::ALL_NULL, stream, mr); - auto offset_column = make_numeric_column( - data_type{type_to_id()}, 2, mask_state::UNALLOCATED, stream, mr); + auto offset_column = + make_numeric_column(data_type{type_to_id()}, 2, mask_state::UNALLOCATED, stream, mr); thrust::sequence(rmm::exec_policy_nosync(stream), - offset_column->mutable_view().begin(), - offset_column->mutable_view().end(), + offset_column->mutable_view().begin(), + offset_column->mutable_view().end(), 0, lv->view().size()); auto wrapped = column_view(data_type{type_id::LIST}, diff --git a/cpp/include/cudf/lists/lists_column_view.hpp b/cpp/include/cudf/lists/lists_column_view.hpp index 60bdc654af6..8c6368eacb6 100644 --- a/cpp/include/cudf/lists/lists_column_view.hpp +++ b/cpp/include/cudf/lists/lists_column_view.hpp @@ -71,9 +71,7 @@ class lists_column_view : private column_view { using column_view::null_mask; using column_view::offset; using column_view::size; - static_assert(std::is_same_v, - "offset_type is expected to be the same as size_type."); - using offset_iterator = offset_type const*; ///< Iterator type for offsets + using offset_iterator = size_type const*; ///< Iterator type for offsets /** * @brief Returns the parent column. @@ -119,7 +117,7 @@ class lists_column_view : private column_view { */ [[nodiscard]] offset_iterator offsets_begin() const noexcept { - return offsets().begin() + offset(); + return offsets().begin() + offset(); } /** diff --git a/cpp/include/cudf/strings/strings_column_view.hpp b/cpp/include/cudf/strings/strings_column_view.hpp index e617dbde024..f1aa8e49f00 100644 --- a/cpp/include/cudf/strings/strings_column_view.hpp +++ b/cpp/include/cudf/strings/strings_column_view.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2022, NVIDIA CORPORATION. + * Copyright (c) 2019-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -67,8 +67,8 @@ class strings_column_view : private column_view { using column_view::offset; using column_view::size; - using offset_iterator = offset_type const*; ///< offsets iterator type - using chars_iterator = char const*; ///< character iterator type + using offset_iterator = size_type const*; ///< offsets iterator type + using chars_iterator = char const*; ///< character iterator type /** * @brief Returns the parent column. 
diff --git a/cpp/include/cudf/tdigest/tdigest_column_view.hpp b/cpp/include/cudf/tdigest/tdigest_column_view.hpp index 89903c24c21..f2f493cbbe4 100644 --- a/cpp/include/cudf/tdigest/tdigest_column_view.hpp +++ b/cpp/include/cudf/tdigest/tdigest_column_view.hpp @@ -67,9 +67,7 @@ class tdigest_column_view : private column_view { tdigest_column_view& operator=(tdigest_column_view&&) = default; using column_view::size; - static_assert(std::is_same_v, - "offset_type is expected to be the same as size_type."); - using offset_iterator = offset_type const*; ///< Iterator over offsets + using offset_iterator = size_type const*; ///< Iterator over offsets // mean and weight column indices within tdigest inner struct columns static constexpr size_type mean_column_index{0}; ///< Mean column index diff --git a/cpp/include/cudf/types.hpp b/cpp/include/cudf/types.hpp index 6991a90b31b..addab160b6e 100644 --- a/cpp/include/cudf/types.hpp +++ b/cpp/include/cudf/types.hpp @@ -80,7 +80,6 @@ class mutable_table_view; using size_type = int32_t; ///< Row index type for columns and tables using bitmask_type = uint32_t; ///< Bitmask type stored as 32-bit unsigned integer using valid_type = uint8_t; ///< Valid type in host memory -using offset_type = int32_t; ///< Offset type for column offsets using thread_index_type = int64_t; ///< Thread index type in kernels /** diff --git a/cpp/include/cudf_test/column_utilities.hpp b/cpp/include/cudf_test/column_utilities.hpp index f288c30e313..059bd10eae1 100644 --- a/cpp/include/cudf_test/column_utilities.hpp +++ b/cpp/include/cudf_test/column_utilities.hpp @@ -261,8 +261,8 @@ inline std::pair, std::vector> to cudf::device_span(scv.chars().data(), scv.chars().size()), cudf::get_default_stream()); auto const h_offsets = cudf::detail::make_std_vector_sync( - cudf::device_span( - scv.offsets().data() + scv.offset(), scv.size() + 1), + cudf::device_span(scv.offsets().data() + scv.offset(), + scv.size() + 1), cudf::get_default_stream()); // build std::string vector from chars and offsets diff --git a/cpp/include/cudf_test/column_wrapper.hpp b/cpp/include/cudf_test/column_wrapper.hpp index 9c1afc64550..1e311322de1 100644 --- a/cpp/include/cudf_test/column_wrapper.hpp +++ b/cpp/include/cudf_test/column_wrapper.hpp @@ -1507,7 +1507,7 @@ class lists_column_wrapper : public detail::column_wrapper { */ static lists_column_wrapper make_one_empty_row_column(bool valid = true) { - cudf::test::fixed_width_column_wrapper offsets{0, 0}; + cudf::test::fixed_width_column_wrapper offsets{0, 0}; cudf::test::fixed_width_column_wrapper values{}; return lists_column_wrapper( 1, diff --git a/cpp/src/copying/concatenate.cu b/cpp/src/copying/concatenate.cu index 170eccbcb09..a53ec295512 100644 --- a/cpp/src/copying/concatenate.cu +++ b/cpp/src/copying/concatenate.cu @@ -408,15 +408,15 @@ void traverse_children::operator()(host_span 0 - ? cudf::detail::get_value( + ? cudf::detail::get_value( scv.offsets(), scv.offset() + scv.size(), stream) - - cudf::detail::get_value(scv.offsets(), scv.offset(), stream) + cudf::detail::get_value(scv.offsets(), scv.offset(), stream) // if the offset() is 0, it can still be sliced to a shorter length. in this case // we only need to read a single offset. otherwise just return the full length // (chars_size()) : scv.size() + 1 == scv.offsets().size() ? 
scv.chars_size() - : cudf::detail::get_value(scv.offsets(), scv.size(), stream)); + : cudf::detail::get_value(scv.offsets(), scv.size(), stream)); }); CUDF_EXPECTS(total_char_count <= static_cast(std::numeric_limits::max()), "Total number of concatenated chars exceeds the column size limit", diff --git a/cpp/src/copying/contiguous_split.cu b/cpp/src/copying/contiguous_split.cu index 6c61af0050d..e1a55ec5419 100644 --- a/cpp/src/copying/contiguous_split.cu +++ b/cpp/src/copying/contiguous_split.cu @@ -939,7 +939,7 @@ struct batch_byte_size_function { * @brief Get the input buffer index given the output buffer index. */ struct out_to_in_index_function { - offset_type const* const batch_offsets; + size_type const* const batch_offsets; int const num_bufs; __device__ int operator()(size_type i) const { @@ -1312,7 +1312,7 @@ std::unique_ptr compute_splits( */ struct chunk_iteration_state { chunk_iteration_state(rmm::device_uvector _d_batched_dst_buf_info, - rmm::device_uvector _d_batch_offsets, + rmm::device_uvector _d_batch_offsets, std::vector&& _h_num_buffs_per_iteration, std::vector&& _h_size_of_buffs_per_iteration, std::size_t total_size) @@ -1375,11 +1375,10 @@ struct chunk_iteration_state { bool has_more_copies() const { return current_iteration < num_iterations; } rmm::device_uvector d_batched_dst_buf_info; ///< dst_buf_info per 1MB batch - rmm::device_uvector const - d_batch_offsets; ///< Offset within a batch per dst_buf_info - std::size_t const total_size; ///< The aggregate size of all iterations - int const num_iterations; ///< The total number of iterations - int current_iteration; ///< Marks the current iteration being worked on + rmm::device_uvector const d_batch_offsets; ///< Offset within a batch per dst_buf_info + std::size_t const total_size; ///< The aggregate size of all iterations + int const num_iterations; ///< The total number of iterations + int current_iteration; ///< Marks the current iteration being worked on private: std::size_t starting_batch; ///< Starting batch index for the current iteration @@ -1398,7 +1397,7 @@ std::unique_ptr chunk_iteration_state::create( rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* temp_mr) { - rmm::device_uvector d_batch_offsets(num_bufs + 1, stream, temp_mr); + rmm::device_uvector d_batch_offsets(num_bufs + 1, stream, temp_mr); auto const buf_count_iter = cudf::detail::make_counting_transform_iterator( 0, [num_bufs, num_batches = num_batches_func{batches.begin()}] __device__(size_type i) { diff --git a/cpp/src/groupby/groupby.cu b/cpp/src/groupby/groupby.cu index eddc748df7c..ce1fc71968f 100644 --- a/cpp/src/groupby/groupby.cu +++ b/cpp/src/groupby/groupby.cu @@ -107,7 +107,7 @@ struct empty_column_constructor { if constexpr (k == aggregation::Kind::COLLECT_LIST || k == aggregation::Kind::COLLECT_SET) { return make_lists_column( - 0, make_empty_column(type_to_id()), empty_like(values), 0, {}); + 0, make_empty_column(type_to_id()), empty_like(values), 0, {}); } if constexpr (k == aggregation::Kind::RANK) { diff --git a/cpp/src/groupby/sort/group_collect.cu b/cpp/src/groupby/sort/group_collect.cu index c61a998a40c..f95ad72f453 100644 --- a/cpp/src/groupby/sort/group_collect.cu +++ b/cpp/src/groupby/sort/group_collect.cu @@ -96,12 +96,12 @@ std::unique_ptr group_collect(column_view const& values, auto [child_column, offsets_column] = [null_handling, num_groups, &values, &group_offsets, stream, mr] { auto offsets_column = make_numeric_column( - data_type(type_to_id()), num_groups + 1, mask_state::UNALLOCATED, stream, mr); + 
data_type(type_to_id()), num_groups + 1, mask_state::UNALLOCATED, stream, mr); thrust::copy(rmm::exec_policy(stream), group_offsets.begin(), group_offsets.end(), - offsets_column->mutable_view().template begin()); + offsets_column->mutable_view().template begin()); // If column of grouped values contains null elements, and null_policy == EXCLUDE, // those elements must be filtered out, and offsets recomputed. diff --git a/cpp/src/groupby/sort/group_merge_lists.cu b/cpp/src/groupby/sort/group_merge_lists.cu index 3043d107635..2c72128dbfb 100644 --- a/cpp/src/groupby/sort/group_merge_lists.cu +++ b/cpp/src/groupby/sort/group_merge_lists.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021, NVIDIA CORPORATION. + * Copyright (c) 2021-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -38,7 +38,7 @@ std::unique_ptr group_merge_lists(column_view const& values, "Input to `group_merge_lists` must be a non-nullable lists column."); auto offsets_column = make_numeric_column( - data_type(type_to_id()), num_groups + 1, mask_state::UNALLOCATED, stream, mr); + data_type(type_to_id()), num_groups + 1, mask_state::UNALLOCATED, stream, mr); // Generate offsets of the output lists column by gathering from the provided group offsets and // the input list offsets. @@ -54,7 +54,7 @@ std::unique_ptr group_merge_lists(column_view const& values, group_offsets.begin(), group_offsets.end(), lists_column_view(values).offsets_begin(), - offsets_column->mutable_view().template begin()); + offsets_column->mutable_view().template begin()); // The child column of the output lists column is just copied from the input column. auto child_column = diff --git a/cpp/src/io/csv/writer_impl.cu b/cpp/src/io/csv/writer_impl.cu index 12e9fccdee7..8c586306ad5 100644 --- a/cpp/src/io/csv/writer_impl.cu +++ b/cpp/src/io/csv/writer_impl.cu @@ -75,10 +75,10 @@ namespace { struct escape_strings_fn { column_device_view const d_column; string_view const d_delimiter; // check for column delimiter - offset_type* d_offsets{}; + size_type* d_offsets{}; char* d_chars{}; - __device__ void write_char(char_utf8 chr, char*& d_buffer, offset_type& bytes) + __device__ void write_char(char_utf8 chr, char*& d_buffer, size_type& bytes) { if (d_buffer) d_buffer += cudf::strings::detail::from_char_utf8(chr, d_buffer); @@ -105,8 +105,8 @@ struct escape_strings_fn { return chr == quote || chr == new_line || chr == d_delimiter[0]; }); - char* d_buffer = d_chars ? d_chars + d_offsets[idx] : nullptr; - offset_type bytes = 0; + char* d_buffer = d_chars ? 
d_chars + d_offsets[idx] : nullptr; + size_type bytes = 0; if (quote_row) write_char(quote, d_buffer, bytes); for (auto chr : d_str) { diff --git a/cpp/src/io/json/json_column.cu b/cpp/src/io/json/json_column.cu index 3a79d832d06..b18637c86d7 100644 --- a/cpp/src/io/json/json_column.cu +++ b/cpp/src/io/json/json_column.cu @@ -363,8 +363,8 @@ std::vector copy_strings_to_host(device_span input, cudf::device_span(scv.chars().data(), scv.chars().size()), cudf::get_default_stream()); auto const h_offsets = cudf::detail::make_std_vector_sync( - cudf::device_span( - scv.offsets().data() + scv.offset(), scv.size() + 1), + cudf::device_span(scv.offsets().data() + scv.offset(), + scv.size() + 1), cudf::get_default_stream()); // build std::string vector from chars and offsets diff --git a/cpp/src/io/json/write_json.cu b/cpp/src/io/json/write_json.cu index 9ecf77a798a..6ee16f8866e 100644 --- a/cpp/src/io/json/write_json.cu +++ b/cpp/src/io/json/write_json.cu @@ -75,10 +75,10 @@ namespace { struct escape_strings_fn { column_device_view const d_column; bool const append_colon{false}; - offset_type* d_offsets{}; + size_type* d_offsets{}; char* d_chars{}; - __device__ void write_char(char_utf8 chr, char*& d_buffer, offset_type& bytes) + __device__ void write_char(char_utf8 chr, char*& d_buffer, size_type& bytes) { if (d_buffer) d_buffer += cudf::strings::detail::from_char_utf8(chr, d_buffer); @@ -91,7 +91,7 @@ struct escape_strings_fn { return nibble < 10 ? '0' + nibble : 'a' + nibble - 10; } - __device__ void write_utf8_codepoint(uint16_t codepoint, char*& d_buffer, offset_type& bytes) + __device__ void write_utf8_codepoint(uint16_t codepoint, char*& d_buffer, size_type& bytes) { if (d_buffer) { d_buffer[0] = '\\'; @@ -106,7 +106,7 @@ struct escape_strings_fn { } } - __device__ void write_utf16_codepoint(uint32_t codepoint, char*& d_buffer, offset_type& bytes) + __device__ void write_utf16_codepoint(uint32_t codepoint, char*& d_buffer, size_type& bytes) { constexpr uint16_t UTF16_HIGH_SURROGATE_BEGIN = 0xD800; constexpr uint16_t UTF16_LOW_SURROGATE_BEGIN = 0xDC00; @@ -130,8 +130,8 @@ struct escape_strings_fn { constexpr char_utf8 const quote = '\"'; // wrap quotes bool constexpr quote_row = true; - char* d_buffer = d_chars ? d_chars + d_offsets[idx] : nullptr; - offset_type bytes = 0; + char* d_buffer = d_chars ? d_chars + d_offsets[idx] : nullptr; + size_type bytes = 0; if (quote_row) write_char(quote, d_buffer, bytes); for (auto utf8_char : d_str) { diff --git a/cpp/src/io/parquet/reader_impl_preprocess.cu b/cpp/src/io/parquet/reader_impl_preprocess.cu index 8c3bdabe6b4..c7e3de03312 100644 --- a/cpp/src/io/parquet/reader_impl_preprocess.cu +++ b/cpp/src/io/parquet/reader_impl_preprocess.cu @@ -1016,7 +1016,7 @@ struct row_size_functor { template <> __device__ size_t row_size_functor::operator()(size_t num_rows, bool nullable) { - auto const offset_size = sizeof(offset_type); + auto const offset_size = sizeof(size_type); // NOTE: Adding the + 1 offset here isn't strictly correct. There will only be 1 extra offset // for the entire column, whereas this is adding an extra offset per page. So we will get a // small over-estimate of the real size of the order : # of pages * 4 bytes. It seems better @@ -1036,7 +1036,7 @@ __device__ size_t row_size_functor::operator()(size_t num_rows, boo { // only returns the size of offsets and validity. the size of the actual string chars // is tracked separately. 
- auto const offset_size = sizeof(offset_type); + auto const offset_size = sizeof(size_type); // see note about offsets in the list_view template. return (offset_size * (num_rows + 1)) + validity_size(num_rows, nullable); } diff --git a/cpp/src/io/statistics/statistics.cuh b/cpp/src/io/statistics/statistics.cuh index 89b26fd731a..805ca43553e 100644 --- a/cpp/src/io/statistics/statistics.cuh +++ b/cpp/src/io/statistics/statistics.cuh @@ -132,7 +132,7 @@ __device__ T get_element(column_device_view const& col, uint32_t row) { using et = typename T::element_type; size_type const index = row + col.offset(); // account for this view's _offset - auto const* d_offsets = col.child(lists_column_view::offsets_column_index).data(); + auto const* d_offsets = col.child(lists_column_view::offsets_column_index).data(); auto const* d_data = col.child(lists_column_view::child_column_index).data(); auto const offset = d_offsets[index]; return T(d_data + offset, d_offsets[index + 1] - offset); diff --git a/cpp/src/lists/combine/concatenate_list_elements.cu b/cpp/src/lists/combine/concatenate_list_elements.cu index fc3d8a9728a..3b00d7bd26e 100644 --- a/cpp/src/lists/combine/concatenate_list_elements.cu +++ b/cpp/src/lists/combine/concatenate_list_elements.cu @@ -53,11 +53,10 @@ std::unique_ptr concatenate_lists_ignore_null(column_view const& input, { auto const num_rows = input.size(); - static_assert(std::is_same_v && std::is_same_v); auto out_offsets = make_numeric_column( - data_type{type_id::INT32}, num_rows + 1, mask_state::UNALLOCATED, stream, mr); + data_type{type_to_id()}, num_rows + 1, mask_state::UNALLOCATED, stream, mr); - auto const d_out_offsets = out_offsets->mutable_view().template begin(); + auto const d_out_offsets = out_offsets->mutable_view().template begin(); auto const d_row_offsets = lists_column_view(input).offsets_begin(); auto const d_list_offsets = lists_column_view(lists_column_view(input).child()).offsets_begin(); @@ -121,13 +120,12 @@ generate_list_offsets_and_validities(column_view const& input, { auto const num_rows = input.size(); - static_assert(std::is_same_v && std::is_same_v); auto out_offsets = make_numeric_column( - data_type{type_id::INT32}, num_rows + 1, mask_state::UNALLOCATED, stream, mr); + data_type{type_to_id()}, num_rows + 1, mask_state::UNALLOCATED, stream, mr); auto const lists_of_lists_dv_ptr = column_device_view::create(input, stream); auto const lists_dv_ptr = column_device_view::create(lists_column_view(input).child(), stream); - auto const d_out_offsets = out_offsets->mutable_view().template begin(); + auto const d_out_offsets = out_offsets->mutable_view().template begin(); auto const d_row_offsets = lists_column_view(input).offsets_begin(); auto const d_list_offsets = lists_column_view(lists_column_view(input).child()).offsets_begin(); @@ -198,7 +196,7 @@ std::unique_ptr gather_list_entries(column_view const& input, d_list_offsets, d_indices = gather_map.begin(), d_out_list_offsets = - output_list_offsets.template begin()] __device__(size_type const idx) { + output_list_offsets.template begin()] __device__(size_type const idx) { // The output row has been identified as a null/empty list during list size computation. 
if (d_out_list_offsets[idx + 1] == d_out_list_offsets[idx]) { return; } diff --git a/cpp/src/lists/combine/concatenate_rows.cu b/cpp/src/lists/combine/concatenate_rows.cu index 993d5e3fc78..658538b0195 100644 --- a/cpp/src/lists/combine/concatenate_rows.cu +++ b/cpp/src/lists/combine/concatenate_rows.cu @@ -77,11 +77,8 @@ generate_regrouped_offsets_and_null_mask(table_device_view const& input, rmm::mr::device_memory_resource* mr) { // outgoing offsets. - auto offsets = cudf::make_fixed_width_column(data_type{type_to_id()}, - input.num_rows() + 1, - mask_state::UNALLOCATED, - stream, - mr); + auto offsets = cudf::make_fixed_width_column( + data_type{type_to_id()}, input.num_rows() + 1, mask_state::UNALLOCATED, stream, mr); auto keys = thrust::make_transform_iterator(thrust::make_counting_iterator(size_t{0}), [num_columns = input.num_columns()] __device__( @@ -91,7 +88,7 @@ generate_regrouped_offsets_and_null_mask(table_device_view const& input, auto values = thrust::make_transform_iterator( thrust::make_counting_iterator(size_t{0}), [input, row_null_counts = row_null_counts.data(), null_policy] __device__( - size_t i) -> offset_type { + size_t i) -> size_type { auto const col_index = i % input.num_columns(); auto const row_index = i / input.num_columns(); @@ -105,7 +102,7 @@ generate_regrouped_offsets_and_null_mask(table_device_view const& input, } } auto offsets = - input.column(col_index).child(lists_column_view::offsets_column_index).data() + + input.column(col_index).child(lists_column_view::offsets_column_index).data() + input.column(col_index).offset(); return offsets[row_index + 1] - offsets[row_index]; }); @@ -115,13 +112,13 @@ generate_regrouped_offsets_and_null_mask(table_device_view const& input, keys + (input.num_rows() * input.num_columns()), values, thrust::make_discard_iterator(), - offsets->mutable_view().begin()); + offsets->mutable_view().begin()); // convert to offsets thrust::exclusive_scan(rmm::exec_policy(stream), - offsets->view().begin(), - offsets->view().begin() + input.num_rows() + 1, - offsets->mutable_view().begin(), + offsets->view().begin(), + offsets->view().begin() + input.num_rows() + 1, + offsets->mutable_view().begin(), 0); // generate appropriate null mask diff --git a/cpp/src/lists/copying/scatter_helper.cu b/cpp/src/lists/copying/scatter_helper.cu index 2cb0671c2dc..ca5358798c0 100644 --- a/cpp/src/lists/copying/scatter_helper.cu +++ b/cpp/src/lists/copying/scatter_helper.cu @@ -189,7 +189,7 @@ struct list_child_constructor { thrust::make_counting_iterator(0), thrust::make_counting_iterator(child_column->size()), child_column->mutable_view().begin(), - [offset_begin = list_offsets.begin(), + [offset_begin = list_offsets.begin(), offset_size = list_offsets.size(), d_list_vector = list_vector.begin(), source_lists, @@ -241,7 +241,7 @@ struct list_child_constructor { thrust::make_counting_iterator(0), thrust::make_counting_iterator(string_views.size()), string_views.begin(), - [offset_begin = list_offsets.begin(), + [offset_begin = list_offsets.begin(), offset_size = list_offsets.size(), d_list_vector = list_vector.begin(), source_lists, @@ -255,7 +255,7 @@ struct list_child_constructor { auto row_index = d_list_vector[list_index].row_index(); auto actual_list_row = d_list_vector[list_index].bind_to_column(source_lists, target_lists); auto lists_column = actual_list_row.get_column(); - auto lists_offsets_ptr = lists_column.offsets().template data(); + auto lists_offsets_ptr = lists_column.offsets().template data(); auto child_strings_column = 
lists_column.child(); auto strings_offset = lists_offsets_ptr[row_index] + intra_index; @@ -308,7 +308,7 @@ struct list_child_constructor { thrust::make_counting_iterator(0), thrust::make_counting_iterator(child_list_views.size()), child_list_views.begin(), - [offset_begin = list_offsets.begin(), + [offset_begin = list_offsets.begin(), offset_size = list_offsets.size(), d_list_vector = list_vector.begin(), source_lists, @@ -323,10 +323,10 @@ struct list_child_constructor { auto actual_list_row = d_list_vector[list_index].bind_to_column(source_lists, target_lists); auto lists_column = actual_list_row.get_column(); auto child_lists_column = lists_column.child(); - auto lists_offsets_ptr = lists_column.offsets().template data(); + auto lists_offsets_ptr = lists_column.offsets().template data(); auto child_lists_offsets_ptr = child_lists_column.child(lists_column_view::offsets_column_index) - .template data(); + .template data(); auto child_row_index = lists_offsets_ptr[row_index] + intra_index; auto size = child_lists_offsets_ptr[child_row_index + 1] - child_lists_offsets_ptr[child_row_index]; diff --git a/cpp/src/lists/interleave_columns.cu b/cpp/src/lists/interleave_columns.cu index f76aaadaf7b..e80d63939ea 100644 --- a/cpp/src/lists/interleave_columns.cu +++ b/cpp/src/lists/interleave_columns.cu @@ -58,11 +58,9 @@ generate_list_offsets_and_validities(table_view const& input, auto const table_dv_ptr = table_device_view::create(input, stream); // The output offsets column. - static_assert(sizeof(offset_type) == sizeof(int32_t)); - static_assert(sizeof(size_type) == sizeof(int32_t)); auto list_offsets = make_numeric_column( - data_type{type_id::INT32}, num_output_lists + 1, mask_state::UNALLOCATED, stream, mr); - auto const d_offsets = list_offsets->mutable_view().template begin(); + data_type{type_to_id()}, num_output_lists + 1, mask_state::UNALLOCATED, stream, mr); + auto const d_offsets = list_offsets->mutable_view().template begin(); // The array of int8_t to store validities for list elements. auto validities = rmm::device_uvector(has_null_mask ? num_output_lists : 0, stream); @@ -82,7 +80,7 @@ generate_list_offsets_and_validities(table_view const& input, auto const& lists_col = table_dv.column(col_id); if (has_null_mask) { d_validities[idx] = static_cast(lists_col.is_valid(list_id)); } auto const list_offsets = - lists_col.child(lists_column_view::offsets_column_index).template data() + + lists_col.child(lists_column_view::offsets_column_index).template data() + lists_col.offset(); return list_offsets[list_id + 1] - list_offsets[list_id]; }); @@ -139,13 +137,13 @@ struct compute_string_sizes_and_interleave_lists_fn { table_device_view const table_dv; // Store list offsets of the output lists column. - offset_type const* const dst_list_offsets; + size_type const* const dst_list_offsets; // Flag to specify whether to compute string validities. bool const has_null_mask; // Store offsets of the strings. - offset_type* d_offsets{nullptr}; + size_type* d_offsets{nullptr}; // If d_chars == nullptr: only compute sizes and validities of the output strings. // If d_chars != nullptr: only interleave lists of strings. 
@@ -164,11 +162,11 @@ struct compute_string_sizes_and_interleave_lists_fn { if (has_null_mask and lists_col.is_null(list_id)) { return; } auto const list_offsets = - lists_col.child(lists_column_view::offsets_column_index).template data() + + lists_col.child(lists_column_view::offsets_column_index).template data() + lists_col.offset(); auto const& str_col = lists_col.child(lists_column_view::child_column_index); auto const str_offsets = - str_col.child(strings_column_view::offsets_column_index).template data(); + str_col.child(strings_column_view::offsets_column_index).template data(); // The range of indices of the strings within the source list. auto const start_str_idx = list_offsets[list_id]; @@ -224,7 +222,7 @@ struct interleave_list_entries_impl(), data_has_null_mask}; + *table_dv_ptr, output_list_offsets.template begin(), data_has_null_mask}; auto validities = rmm::device_uvector(data_has_null_mask ? num_output_entries : 0, stream); @@ -276,14 +274,14 @@ struct interleave_list_entries_impl( [num_cols, table_dv = *table_dv_ptr, d_validities = validities.begin(), - d_offsets = output_list_offsets.template begin(), + d_offsets = output_list_offsets.template begin(), d_output = output_dv_ptr->template begin(), data_has_null_mask] __device__(size_type const idx) { auto const col_id = idx % num_cols; auto const list_id = idx / num_cols; auto const& lists_col = table_dv.column(col_id); auto const list_offsets = - lists_col.child(lists_column_view::offsets_column_index).template data() + + lists_col.child(lists_column_view::offsets_column_index).template data() + lists_col.offset(); auto const& data_col = lists_col.child(lists_column_view::child_column_index); @@ -384,7 +382,7 @@ std::unique_ptr interleave_columns(table_view const& input, // specialized for different types. 
auto const num_output_lists = input.num_rows() * input.num_columns(); auto const num_output_entries = - cudf::detail::get_value(offsets_view, num_output_lists, stream); + cudf::detail::get_value(offsets_view, num_output_lists, stream); auto const data_has_null_mask = std::any_of(std::cbegin(input), std::cend(input), [](auto const& col) { return col.child(lists_column_view::child_column_index).nullable(); diff --git a/cpp/src/lists/lists_column_factories.cu b/cpp/src/lists/lists_column_factories.cu index 754735f5a5b..7f82d32d327 100644 --- a/cpp/src/lists/lists_column_factories.cu +++ b/cpp/src/lists/lists_column_factories.cu @@ -39,7 +39,7 @@ std::unique_ptr make_lists_column_from_scalar(list_scalar const& v { if (size == 0) { return make_lists_column(0, - make_empty_column(type_to_id()), + make_empty_column(type_to_id()), empty_like(value.view()), 0, cudf::detail::create_null_mask(0, mask_state::UNALLOCATED, stream, mr), @@ -50,7 +50,7 @@ std::unique_ptr make_lists_column_from_scalar(list_scalar const& v // Handcraft a 1-row column auto offsets = make_numeric_column( - data_type{type_to_id()}, 2, mask_state::UNALLOCATED, stream, mr_final); + data_type{type_to_id()}, 2, mask_state::UNALLOCATED, stream, mr_final); auto m_offsets = offsets->mutable_view(); thrust::sequence(rmm::exec_policy(stream), m_offsets.begin(), @@ -90,7 +90,7 @@ std::unique_ptr make_empty_lists_column(data_type child_type, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { - auto offsets = make_empty_column(data_type(type_to_id())); + auto offsets = make_empty_column(data_type(type_to_id())); auto child = make_empty_column(child_type); return make_lists_column( 0, std::move(offsets), std::move(child), 0, rmm::device_buffer{}, stream, mr); @@ -103,7 +103,7 @@ std::unique_ptr make_all_nulls_lists_column(size_type size, { auto offsets = [&] { auto offsets_buff = - cudf::detail::make_zeroed_device_uvector_async(size + 1, stream, mr); + cudf::detail::make_zeroed_device_uvector_async(size + 1, stream, mr); return std::make_unique(std::move(offsets_buff), rmm::device_buffer{}, 0); }(); auto child = make_empty_column(child_type); diff --git a/cpp/src/lists/reverse.cu b/cpp/src/lists/reverse.cu index d606f11bdb9..a2af85b5dad 100644 --- a/cpp/src/lists/reverse.cu +++ b/cpp/src/lists/reverse.cu @@ -56,7 +56,7 @@ std::unique_ptr reverse(lists_column_view const& input, thrust::for_each_n(rmm::exec_policy(stream), thrust::counting_iterator(0), child.size(), - [list_offsets = out_offsets->view().begin(), + [list_offsets = out_offsets->view().begin(), list_indices = labels->view().begin(), gather_map = gather_map.begin()] __device__(auto const idx) { auto const list_idx = list_indices[idx]; diff --git a/cpp/src/lists/sequences.cu b/cpp/src/lists/sequences.cu index 8e1e6c37a95..aaee5608cc3 100644 --- a/cpp/src/lists/sequences.cu +++ b/cpp/src/lists/sequences.cu @@ -47,7 +47,7 @@ struct tabulator { T const* const starts; T const* const steps; - offset_type const* const offsets; + size_type const* const offsets; template static std::enable_if_t(), T> __device__ multiply(U x, size_type times) @@ -86,7 +86,7 @@ struct sequences_dispatcher { size_type n_elements, column_view const& starts, std::optional const& steps, - offset_type const* offsets, + size_type const* offsets, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { @@ -106,7 +106,7 @@ struct sequences_functor()>> { size_type n_elements, column_view const& starts, std::optional const& steps, - offset_type const* offsets, + size_type const* 
offsets, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { @@ -154,8 +154,8 @@ std::unique_ptr sequences(column_view const& starts, // Generate list offsets for the output. auto list_offsets = make_numeric_column( - data_type(type_to_id()), n_lists + 1, mask_state::UNALLOCATED, stream, mr); - auto const offsets_begin = list_offsets->mutable_view().template begin(); + data_type(type_to_id()), n_lists + 1, mask_state::UNALLOCATED, stream, mr); + auto const offsets_begin = list_offsets->mutable_view().template begin(); auto const sizes_input_it = cudf::detail::indexalator_factory::make_input_iterator(sizes); // First copy the sizes since the exclusive_scan tries to read (n_lists+1) values thrust::copy_n(rmm::exec_policy(stream), sizes_input_it, sizes.size(), offsets_begin); diff --git a/cpp/src/lists/stream_compaction/apply_boolean_mask.cu b/cpp/src/lists/stream_compaction/apply_boolean_mask.cu index 0aaa8356304..ad43fbd5b00 100644 --- a/cpp/src/lists/stream_compaction/apply_boolean_mask.cu +++ b/cpp/src/lists/stream_compaction/apply_boolean_mask.cu @@ -74,7 +74,7 @@ std::unique_ptr apply_boolean_mask(lists_column_view const& input, stream, rmm::mr::get_current_device_resource()); auto const d_sizes = column_device_view::create(*sizes, stream); - auto const sizes_begin = cudf::detail::make_null_replacement_iterator(*d_sizes, offset_type{0}); + auto const sizes_begin = cudf::detail::make_null_replacement_iterator(*d_sizes, size_type{0}); auto const sizes_end = sizes_begin + sizes->size(); auto output_offsets = cudf::make_numeric_column( offset_data_type, num_rows + 1, mask_state::UNALLOCATED, stream, mr); @@ -82,12 +82,10 @@ std::unique_ptr apply_boolean_mask(lists_column_view const& input, // Could have attempted an exclusive_scan(), but it would not compute the last entry. // Instead, inclusive_scan(), followed by writing `0` to the head of the offsets column. - thrust::inclusive_scan(rmm::exec_policy(stream), - sizes_begin, - sizes_end, - output_offsets_view.begin() + 1); + thrust::inclusive_scan( + rmm::exec_policy(stream), sizes_begin, sizes_end, output_offsets_view.begin() + 1); CUDF_CUDA_TRY(cudaMemsetAsync( - output_offsets_view.begin(), 0, sizeof(offset_type), stream.value())); + output_offsets_view.begin(), 0, sizeof(size_type), stream.value())); return output_offsets; }; diff --git a/cpp/src/lists/utilities.cu b/cpp/src/lists/utilities.cu index 50a41c51f76..2c4966c969e 100644 --- a/cpp/src/lists/utilities.cu +++ b/cpp/src/lists/utilities.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022, NVIDIA CORPORATION. + * Copyright (c) 2022-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -42,10 +42,10 @@ std::unique_ptr reconstruct_offsets(column_view const& labels, { auto out_offsets = make_numeric_column( - data_type{type_to_id()}, n_lists + 1, mask_state::UNALLOCATED, stream, mr); + data_type{type_to_id()}, n_lists + 1, mask_state::UNALLOCATED, stream, mr); auto const labels_begin = labels.template begin(); - auto const offsets_begin = out_offsets->mutable_view().template begin(); + auto const offsets_begin = out_offsets->mutable_view().template begin(); cudf::detail::labels_to_offsets(labels_begin, labels_begin + labels.size(), offsets_begin, @@ -60,7 +60,7 @@ std::unique_ptr get_normalized_offsets(lists_column_view const& input, { if (input.is_empty()) { return empty_like(input.offsets()); } - auto out_offsets = make_numeric_column(data_type(type_to_id()), + auto out_offsets = make_numeric_column(data_type(type_to_id()), input.size() + 1, cudf::mask_state::UNALLOCATED, stream, @@ -68,7 +68,7 @@ std::unique_ptr get_normalized_offsets(lists_column_view const& input, thrust::transform(rmm::exec_policy(stream), input.offsets_begin(), input.offsets_end(), - out_offsets->mutable_view().begin(), + out_offsets->mutable_view().begin(), [d_offsets = input.offsets_begin()] __device__(auto const offset_val) { // The first offset value, used for zero-normalizing offsets. return offset_val - *d_offsets; diff --git a/cpp/src/quantiles/tdigest/tdigest.cu b/cpp/src/quantiles/tdigest/tdigest.cu index 0c90b0af8d2..cfdb386ff64 100644 --- a/cpp/src/quantiles/tdigest/tdigest.cu +++ b/cpp/src/quantiles/tdigest/tdigest.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2022, NVIDIA CORPORATION. + * Copyright (c) 2021-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -66,7 +66,7 @@ struct make_centroid { // kernel for computing percentiles on input tdigest (mean, weight) centroid data. 
template -__global__ void compute_percentiles_kernel(device_span tdigest_offsets, +__global__ void compute_percentiles_kernel(device_span tdigest_offsets, column_device_view percentiles, CentroidIter centroids_, double const* min_, @@ -199,8 +199,8 @@ std::unique_ptr compute_approx_percentiles(tdigest_column_view const& in rmm::mr::get_current_device_resource()); auto keys = cudf::detail::make_counting_transform_iterator( 0, - [offsets_begin = offsets.begin(), - offsets_end = offsets.end()] __device__(size_type i) { + [offsets_begin = offsets.begin(), + offsets_end = offsets.end()] __device__(size_type i) { return thrust::distance( offsets_begin, thrust::prev(thrust::upper_bound(thrust::seq, offsets_begin, offsets_end, i))); @@ -239,7 +239,7 @@ std::unique_ptr compute_approx_percentiles(tdigest_column_view const& in constexpr size_type block_size = 256; cudf::detail::grid_1d const grid(percentiles.size() * input.size(), block_size); compute_percentiles_kernel<<>>( - {offsets.begin(), static_cast(offsets.size())}, + {offsets.begin(), static_cast(offsets.size())}, *percentiles_cdv, centroids, tdv.min_begin(), @@ -294,8 +294,8 @@ std::unique_ptr make_empty_tdigest_column(rmm::cuda_stream_view stream, auto offsets = cudf::make_fixed_width_column( data_type(type_id::INT32), 2, mask_state::UNALLOCATED, stream, mr); thrust::fill(rmm::exec_policy(stream), - offsets->mutable_view().begin(), - offsets->mutable_view().end(), + offsets->mutable_view().begin(), + offsets->mutable_view().end(), 0); auto min_col = @@ -362,7 +362,7 @@ std::unique_ptr percentile_approx(tdigest_column_view const& input, thrust::exclusive_scan(rmm::exec_policy(stream), row_size_iter, row_size_iter + input.size() + 1, - offsets->mutable_view().begin()); + offsets->mutable_view().begin()); if (percentiles.size() == 0 || all_empty_rows) { return cudf::make_lists_column( diff --git a/cpp/src/quantiles/tdigest/tdigest_aggregation.cu b/cpp/src/quantiles/tdigest/tdigest_aggregation.cu index 35f2ce05bb6..2ce55e10fb1 100644 --- a/cpp/src/quantiles/tdigest/tdigest_aggregation.cu +++ b/cpp/src/quantiles/tdigest/tdigest_aggregation.cu @@ -127,7 +127,7 @@ struct merge_centroids { * nearest whole number <= it is floor(3.56) == 3. */ struct nearest_value_scalar_weights_grouped { - offset_type const* group_offsets; + size_type const* group_offsets; thrust::pair operator() __device__(double next_limit, size_type group_index) const { @@ -167,8 +167,8 @@ struct nearest_value_scalar_weights { template struct nearest_value_centroid_weights { double const* cumulative_weights; - GroupOffsetsIter outer_offsets; // groups - offset_type const* inner_offsets; // tdigests within a group + GroupOffsetsIter outer_offsets; // groups + size_type const* inner_offsets; // tdigests within a group thrust::pair operator() __device__(double next_limit, size_type group_index) const { @@ -234,8 +234,8 @@ template struct cumulative_centroid_weight { double const* cumulative_weights; GroupLabelsIter group_labels; - GroupOffsetsIter outer_offsets; // groups - cudf::device_span inner_offsets; // tdigests with a group + GroupOffsetsIter outer_offsets; // groups + cudf::device_span inner_offsets; // tdigests with a group std::tuple operator() __device__(size_type value_index) const { @@ -257,7 +257,7 @@ struct cumulative_centroid_weight { // retrieve group info (total weight, size, start offset) of scalar inputs by group index. 
struct scalar_group_info_grouped { size_type const* group_valid_counts; - offset_type const* group_offsets; + size_type const* group_offsets; __device__ thrust::tuple operator()(size_type group_index) const { @@ -283,7 +283,7 @@ template struct centroid_group_info { double const* cumulative_weights; GroupOffsetsIter outer_offsets; - offset_type const* inner_offsets; + size_type const* inner_offsets; __device__ thrust::tuple operator()(size_type group_index) const { @@ -375,7 +375,7 @@ __global__ void generate_cluster_limits_kernel(int delta, CumulativeWeight cumulative_weight, double* group_cluster_wl, size_type* group_num_clusters, - offset_type const* group_cluster_offsets, + size_type const* group_cluster_offsets, bool has_nulls) { int const tid = threadIdx.x + blockIdx.x * blockDim.x; @@ -544,12 +544,12 @@ generate_group_cluster_info(int delta, thrust::exclusive_scan(rmm::exec_policy(stream), cluster_size, cluster_size + num_groups + 1, - group_cluster_offsets->mutable_view().begin(), + group_cluster_offsets->mutable_view().begin(), 0); // total # of clusters - offset_type total_clusters = - cudf::detail::get_value(group_cluster_offsets->view(), num_groups, stream); + size_type total_clusters = + cudf::detail::get_value(group_cluster_offsets->view(), num_groups, stream); // fill in the actual cluster weight limits rmm::device_uvector group_cluster_wl(total_clusters, stream); @@ -561,7 +561,7 @@ generate_group_cluster_info(int delta, cumulative_weight, group_cluster_wl.begin(), group_num_clusters.begin(), - group_cluster_offsets->view().begin(), + group_cluster_offsets->view().begin(), has_nulls); return {std::move(group_cluster_wl), @@ -584,7 +584,7 @@ std::unique_ptr build_output_column(size_type num_rows, return weights[i] == 0; }; // whether or not this particular tdigest is a stub - auto is_stub_digest = [offsets = offsets->view().begin(), is_stub_weight] __device__( + auto is_stub_digest = [offsets = offsets->view().begin(), is_stub_weight] __device__( size_type i) { return is_stub_weight(offsets[i]) ? 1 : 0; }; size_type const num_stubs = [&]() { @@ -622,12 +622,12 @@ std::unique_ptr build_output_column(size_type num_rows, auto _weights = remove_stubs(*weights, num_stubs); // adjust offsets. - rmm::device_uvector sizes(num_rows, stream); + rmm::device_uvector sizes(num_rows, stream); thrust::transform(rmm::exec_policy(stream), thrust::make_counting_iterator(0), thrust::make_counting_iterator(0) + num_rows, sizes.begin(), - [offsets = offsets->view().begin()] __device__(size_type i) { + [offsets = offsets->view().begin()] __device__(size_type i) { return offsets[i + 1] - offsets[i]; }); auto iter = cudf::detail::make_counting_transform_iterator( @@ -637,7 +637,7 @@ std::unique_ptr build_output_column(size_type num_rows, thrust::exclusive_scan(rmm::exec_policy(stream), iter, iter + num_rows + 1, - offsets->mutable_view().begin(), + offsets->mutable_view().begin(), 0); // assemble final column @@ -717,7 +717,7 @@ std::unique_ptr compute_tdigests(int delta, thrust::make_counting_iterator(0), [delta, group_cluster_wl = group_cluster_wl.data(), - group_cluster_offsets = group_cluster_offsets->view().begin(), + group_cluster_offsets = group_cluster_offsets->view().begin(), group_cumulative_weight] __device__(size_type value_index) -> size_type { // get group index, relative value index within the group and cumulative weight. 
[[maybe_unused]] auto [group_index, relative_value_index, cumulative_weight] = @@ -1018,10 +1018,10 @@ std::unique_ptr merge_tdigests(tdigest_column_view const& tdv, // bring tdigest offsets back to the host auto tdigest_offsets = tdv.centroids().offsets(); - std::vector h_inner_offsets(tdigest_offsets.size()); + std::vector h_inner_offsets(tdigest_offsets.size()); cudaMemcpyAsync(h_inner_offsets.data(), - tdigest_offsets.begin(), - sizeof(offset_type) * tdigest_offsets.size(), + tdigest_offsets.begin(), + sizeof(size_type) * tdigest_offsets.size(), cudaMemcpyDefault, stream); @@ -1154,7 +1154,7 @@ std::unique_ptr merge_tdigests(tdigest_column_view const& tdv, cumulative_weights->view().begin(), group_labels, group_offsets, - {tdigest_offsets.begin(), static_cast(tdigest_offsets.size())}}, + {tdigest_offsets.begin(), static_cast(tdigest_offsets.size())}}, false, stream, mr); @@ -1174,7 +1174,7 @@ std::unique_ptr merge_tdigests(tdigest_column_view const& tdv, cumulative_weights->view().begin(), group_labels, group_offsets, - {tdigest_offsets.begin(), static_cast(tdigest_offsets.size())}}, + {tdigest_offsets.begin(), static_cast(tdigest_offsets.size())}}, std::move(merged_min_col), std::move(merged_max_col), group_cluster_wl, diff --git a/cpp/src/rolling/detail/nth_element.cuh b/cpp/src/rolling/detail/nth_element.cuh index c28d96e7793..bd3cbb39168 100644 --- a/cpp/src/rolling/detail/nth_element.cuh +++ b/cpp/src/rolling/detail/nth_element.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022, NVIDIA CORPORATION. + * Copyright (c) 2022-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -157,7 +157,7 @@ std::unique_ptr nth_element(size_type n, gather_index_calculator{ n, input, preceding, following, min_periods, stream}); - auto gather_map = rmm::device_uvector(input.size(), stream); + auto gather_map = rmm::device_uvector(input.size(), stream); thrust::copy( rmm::exec_policy(stream), gather_iter, gather_iter + input.size(), gather_map.begin()); diff --git a/cpp/src/rolling/detail/rolling.cuh b/cpp/src/rolling/detail/rolling.cuh index 84352e36550..3b6d53f43c4 100644 --- a/cpp/src/rolling/detail/rolling.cuh +++ b/cpp/src/rolling/detail/rolling.cuh @@ -454,7 +454,7 @@ struct agg_specific_empty_output { if constexpr (op == aggregation::COLLECT_LIST) { return cudf::make_lists_column( - 0, make_empty_column(type_to_id()), empty_like(input), 0, {}); + 0, make_empty_column(type_to_id()), empty_like(input), 0, {}); } return empty_like(input); diff --git a/cpp/src/rolling/detail/rolling_collect_list.cu b/cpp/src/rolling/detail/rolling_collect_list.cu index f7544e81ba5..85dced0efe3 100644 --- a/cpp/src/rolling/detail/rolling_collect_list.cu +++ b/cpp/src/rolling/detail/rolling_collect_list.cu @@ -140,8 +140,8 @@ std::pair, std::unique_ptr> purge_null_entries( thrust::tabulate(rmm::exec_policy(stream), new_sizes->mutable_view().template begin(), new_sizes->mutable_view().template end(), - [d_gather_map = gather_map.template begin(), - d_old_offsets = offsets.template begin(), + [d_gather_map = gather_map.template begin(), + d_old_offsets = offsets.template begin(), input_row_not_null] __device__(auto i) { return thrust::count_if(thrust::seq, d_gather_map + d_old_offsets[i], diff --git a/cpp/src/rolling/grouped_rolling.cu b/cpp/src/rolling/grouped_rolling.cu index 899cd8b6c86..ca5c04d1c4f 100644 --- a/cpp/src/rolling/grouped_rolling.cu +++ b/cpp/src/rolling/grouped_rolling.cu @@ -327,12 +327,12 @@ 
std::unique_ptr expand_to_column(Calculator const& calc, rmm::cuda_stream_view stream) { auto window_column = cudf::make_numeric_column( - cudf::data_type{type_to_id()}, num_rows, cudf::mask_state::UNALLOCATED, stream); + cudf::data_type{type_to_id()}, num_rows, cudf::mask_state::UNALLOCATED, stream); auto begin = cudf::detail::make_counting_transform_iterator(0, calc); thrust::copy_n( - rmm::exec_policy(stream), begin, num_rows, window_column->mutable_view().data()); + rmm::exec_policy(stream), begin, num_rows, window_column->mutable_view().data()); return window_column; } diff --git a/cpp/src/strings/capitalize.cu b/cpp/src/strings/capitalize.cu index fcb0bacad9a..4e248922702 100644 --- a/cpp/src/strings/capitalize.cu +++ b/cpp/src/strings/capitalize.cu @@ -63,7 +63,7 @@ struct base_fn { character_cases_table_type const* d_case_table; special_case_mapping const* d_special_case_mapping; column_device_view const d_column; - offset_type* d_offsets{}; + size_type* d_offsets{}; char* d_chars{}; base_fn(column_device_view const& d_column) @@ -111,11 +111,11 @@ struct base_fn { return; } - auto& derived = static_cast(*this); - auto const d_str = d_column.element(idx); - offset_type bytes = 0; - auto d_buffer = d_chars ? d_chars + d_offsets[idx] : nullptr; - bool capitalize = true; + auto& derived = static_cast(*this); + auto const d_str = d_column.element(idx); + size_type bytes = 0; + auto d_buffer = d_chars ? d_chars + d_offsets[idx] : nullptr; + bool capitalize = true; for (auto const chr : d_str) { auto const info = get_char_info(d_flags, chr); auto const flag = info.second; diff --git a/cpp/src/strings/combine/concatenate.cu b/cpp/src/strings/combine/concatenate.cu index 29023fbb139..ba8acd23467 100644 --- a/cpp/src/strings/combine/concatenate.cu +++ b/cpp/src/strings/combine/concatenate.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2022, NVIDIA CORPORATION. + * Copyright (c) 2019-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -48,7 +48,7 @@ struct concat_strings_base { table_device_view const d_table; string_scalar_device_view const d_narep; separator_on_nulls separate_nulls; - offset_type* d_offsets{}; + size_type* d_offsets{}; char* d_chars{}; /** @@ -72,7 +72,7 @@ struct concat_strings_base { } char* d_buffer = d_chars ? d_chars + d_offsets[idx] : nullptr; - offset_type bytes = 0; + size_type bytes = 0; bool write_separator = false; for (auto itr = d_table.begin(); itr < d_table.end(); ++itr) { diff --git a/cpp/src/strings/combine/join_list_elements.cu b/cpp/src/strings/combine/join_list_elements.cu index 7c9acbfbc58..eee59e37478 100644 --- a/cpp/src/strings/combine/join_list_elements.cu +++ b/cpp/src/strings/combine/join_list_elements.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2022, NVIDIA CORPORATION. + * Copyright (c) 2021-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -53,13 +53,13 @@ template struct compute_size_and_concatenate_fn { Functor const func; column_device_view const lists_dv; - offset_type const* const list_offsets; + size_type const* const list_offsets; column_device_view const strings_dv; string_scalar_device_view const string_narep_dv; separator_on_nulls const separate_nulls; output_if_empty_list const empty_list_policy; - offset_type* d_offsets{nullptr}; + size_type* d_offsets{nullptr}; // If d_chars == nullptr: only compute sizes and validities of the output strings. // If d_chars != nullptr: only concatenate strings. diff --git a/cpp/src/strings/convert/convert_booleans.cu b/cpp/src/strings/convert/convert_booleans.cu index 4f446c8c1cf..0d04fc74b0c 100644 --- a/cpp/src/strings/convert/convert_booleans.cu +++ b/cpp/src/strings/convert/convert_booleans.cu @@ -95,7 +95,7 @@ struct from_booleans_fn { column_device_view const d_column; string_view d_true; string_view d_false; - offset_type* d_offsets{}; + size_type* d_offsets{}; char* d_chars{}; __device__ void operator()(size_type idx) const diff --git a/cpp/src/strings/convert/convert_durations.cu b/cpp/src/strings/convert/convert_durations.cu index 0a0f197c8a1..863f76b9b98 100644 --- a/cpp/src/strings/convert/convert_durations.cu +++ b/cpp/src/strings/convert/convert_durations.cu @@ -191,7 +191,7 @@ struct from_durations_fn { column_device_view d_durations; format_item const* d_format_items; size_type items_count; - offset_type* d_offsets{}; + size_type* d_offsets{}; char* d_chars{}; __device__ int8_t format_length(char format_char, duration_component const* const timeparts) const diff --git a/cpp/src/strings/convert/convert_fixed_point.cu b/cpp/src/strings/convert/convert_fixed_point.cu index cb061d03e5a..a3336258d3e 100644 --- a/cpp/src/strings/convert/convert_fixed_point.cu +++ b/cpp/src/strings/convert/convert_fixed_point.cu @@ -197,7 +197,7 @@ namespace { template struct from_fixed_point_fn { column_device_view d_decimals; - offset_type* d_offsets{}; + size_type* d_offsets{}; char* d_chars{}; /** diff --git a/cpp/src/strings/convert/convert_hex.cu b/cpp/src/strings/convert/convert_hex.cu index 8728ad06964..bed682aba71 100644 --- a/cpp/src/strings/convert/convert_hex.cu +++ b/cpp/src/strings/convert/convert_hex.cu @@ -129,7 +129,7 @@ void dispatch_hex_to_integers_fn::operator()(column_device_view const&, template struct integer_to_hex_fn { column_device_view const d_column; - offset_type* d_offsets{}; + size_type* d_offsets{}; char* d_chars{}; __device__ void byte_to_hex(uint8_t byte, char* hex) @@ -173,7 +173,7 @@ struct integer_to_hex_fn { --byte_index; } } else { - d_offsets[idx] = static_cast(bytes) * 2; // 2 hex characters per byte + d_offsets[idx] = static_cast(bytes) * 2; // 2 hex characters per byte } } }; diff --git a/cpp/src/strings/convert/convert_lists.cu b/cpp/src/strings/convert/convert_lists.cu index 609ced97c26..3aef37914fd 100644 --- a/cpp/src/strings/convert/convert_lists.cu +++ b/cpp/src/strings/convert/convert_lists.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2022, NVIDIA CORPORATION. + * Copyright (c) 2021-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
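The functors in the surrounding hunks (base_fn in capitalize.cu, concat_strings_base, compute_size_and_concatenate_fn) all follow the two-pass convention spelled out in their own comments: with d_chars == nullptr the call only records each row's output size in d_offsets; once those sizes have been scanned into offsets, a second call writes bytes at d_chars + d_offsets[idx]. A host-side sketch of that convention with illustrative names, independent of the real cudf utilities:

    #include <cctype>
    #include <cstddef>
    #include <cstdint>
    #include <iostream>
    #include <numeric>
    #include <string>
    #include <vector>

    using size_type = std::int32_t;  // stand-in for cudf::size_type

    // Pass 1: d_chars == nullptr, so only per-row sizes are written into d_offsets.
    // Pass 2: d_chars != nullptr, so bytes are written at d_chars + d_offsets[row].
    struct upper_fn {
      const std::vector<std::string>& input;
      size_type* d_offsets = nullptr;
      char* d_chars = nullptr;

      void operator()(std::size_t row) const {
        const std::string& s = input[row];
        if (d_chars == nullptr) {
          d_offsets[row] = static_cast<size_type>(s.size());  // size-only pass
          return;
        }
        char* out = d_chars + d_offsets[row];
        for (char c : s) *out++ = static_cast<char>(std::toupper(static_cast<unsigned char>(c)));
      }
    };

    int main() {
      std::vector<std::string> input{"abc", "", "hello"};
      std::vector<size_type> offsets(input.size() + 1, 0);

      upper_fn fn{input, offsets.data(), nullptr};
      for (std::size_t i = 0; i < input.size(); ++i) fn(i);  // sizes
      std::exclusive_scan(offsets.begin(), offsets.end(), offsets.begin(), size_type{0});

      std::vector<char> chars(offsets.back());
      fn.d_chars = chars.data();
      for (std::size_t i = 0; i < input.size(); ++i) fn(i);  // bytes

      std::cout << std::string(chars.begin(), chars.end()) << '\n';  // ABCHELLO
    }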
@@ -134,7 +134,7 @@ struct format_lists_fn { auto const view = get_nested_child(stack_idx); auto offsets = view.child(cudf::lists_column_view::offsets_column_index); - auto d_offsets = offsets.data() + view.offset(); + auto d_offsets = offsets.data() + view.offset(); // add pending separator if (item.separator == item_separator::LIST) { diff --git a/cpp/src/strings/convert/convert_urls.cu b/cpp/src/strings/convert/convert_urls.cu index 8b6305b68e2..401a04cdc9d 100644 --- a/cpp/src/strings/convert/convert_urls.cu +++ b/cpp/src/strings/convert/convert_urls.cu @@ -197,7 +197,7 @@ __forceinline__ __device__ char escaped_sequence_to_byte(char const* const ptr) */ template __global__ void url_decode_char_counter(column_device_view const in_strings, - offset_type* const out_counts) + size_type* const out_counts) { constexpr int halo_size = 2; __shared__ char temporary_buffer[num_warps_per_threadblock][char_block_size + halo_size]; @@ -221,7 +221,7 @@ __global__ void url_decode_char_counter(column_device_view const in_strings, auto const in_chars = in_string.data(); auto const string_length = in_string.size_bytes(); int const nblocks = cudf::util::div_rounding_up_unsafe(string_length, char_block_size); - offset_type escape_char_count = 0; + size_type escape_char_count = 0; for (int block_idx = 0; block_idx < nblocks; block_idx++) { int const string_length_block = @@ -280,7 +280,7 @@ __global__ void url_decode_char_counter(column_device_view const in_strings, template __global__ void url_decode_char_replacer(column_device_view const in_strings, char* const out_chars, - offset_type const* const out_offsets) + size_type const* const out_offsets) { constexpr int halo_size = 2; __shared__ char temporary_buffer[num_warps_per_threadblock][char_block_size + halo_size * 2]; @@ -393,18 +393,17 @@ std::unique_ptr url_decode(strings_column_view const& strings, auto offsets_mutable_view = offsets_column->mutable_view(); url_decode_char_counter <<>>( - *d_strings, offsets_mutable_view.begin()); + *d_strings, offsets_mutable_view.begin()); // use scan to transform number of bytes into offsets thrust::exclusive_scan(rmm::exec_policy(stream), - offsets_view.begin(), - offsets_view.end(), - offsets_mutable_view.begin()); + offsets_view.begin(), + offsets_view.end(), + offsets_mutable_view.begin()); // copy the total number of characters of all strings combined (last element of the offset column) // to the host memory - auto out_chars_bytes = - cudf::detail::get_value(offsets_view, offset_count - 1, stream); + auto out_chars_bytes = cudf::detail::get_value(offsets_view, offset_count - 1, stream); // create the chars column auto chars_column = create_chars_child_column(out_chars_bytes, stream, mr); @@ -413,7 +412,7 @@ std::unique_ptr url_decode(strings_column_view const& strings, // decode and copy the characters from the input column to the output column url_decode_char_replacer <<>>( - *d_strings, d_out_chars, offsets_column->view().begin()); + *d_strings, d_out_chars, offsets_column->view().begin()); // copy null mask rmm::device_buffer null_mask = cudf::detail::copy_bitmask(strings.parent(), stream, mr); diff --git a/cpp/src/strings/copying/concatenate.cu b/cpp/src/strings/copying/concatenate.cu index c5dfd4a8b93..287910c9a6f 100644 --- a/cpp/src/strings/copying/concatenate.cu +++ b/cpp/src/strings/copying/concatenate.cu @@ -287,12 +287,12 @@ std::unique_ptr concatenate(host_span columns, column_view chars_child = column->child(strings_column_view::chars_column_index); auto bytes_offset = - 
cudf::detail::get_value(offsets_child, column_offset, stream); + cudf::detail::get_value(offsets_child, column_offset, stream); // copy the chars column data auto d_chars = chars_child.data() + bytes_offset; auto const bytes = - cudf::detail::get_value(offsets_child, column_size + column_offset, stream) - + cudf::detail::get_value(offsets_child, column_size + column_offset, stream) - bytes_offset; CUDF_CUDA_TRY( diff --git a/cpp/src/strings/copying/shift.cu b/cpp/src/strings/copying/shift.cu index bdcf01bd336..5f8fc483a34 100644 --- a/cpp/src/strings/copying/shift.cu +++ b/cpp/src/strings/copying/shift.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2022, NVIDIA CORPORATION. + * Copyright (c) 2021-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -36,15 +36,15 @@ struct adjust_offsets_fn { string_view const d_filler; size_type const offset; - __device__ offset_type operator()(size_type idx) + __device__ size_type operator()(size_type idx) { if (offset < 0) { - auto const first = d_column.element(-offset); + auto const first = d_column.element(-offset); auto const last_index = d_column.size() + offset; if (idx < last_index) { - return d_column.element(idx - offset) - first; + return d_column.element(idx - offset) - first; } else { - auto const last = d_column.element(d_column.size() - 1); + auto const last = d_column.element(d_column.size() - 1); return (last - first) + ((idx - last_index + 1) * d_filler.size_bytes()); } } else { @@ -52,7 +52,7 @@ struct adjust_offsets_fn { return idx * d_filler.size_bytes(); } else { auto const total_filler = d_filler.size_bytes() * offset; - return total_filler + d_column.element(idx - offset); + return total_filler + d_column.element(idx - offset); } } } @@ -112,19 +112,19 @@ std::unique_ptr shift(strings_column_view const& input, thrust::transform(rmm::exec_policy(stream), thrust::counting_iterator(0), thrust::counting_iterator(offsets_size), - d_offsets->data(), + d_offsets->data(), adjust_offsets_fn{*d_input_offsets, d_fill_str, offset}); // compute the shift-offset for the output characters child column auto const shift_offset = [&] { auto const index = (offset >= 0) ? offset : offsets_size - 1 + offset; return (offset < 0 ? -1 : 1) * - cudf::detail::get_value(offsets_column->view(), index, stream); + cudf::detail::get_value(offsets_column->view(), index, stream); }(); // create output chars child column auto const chars_size = - cudf::detail::get_value(offsets_column->view(), offsets_size - 1, stream); + cudf::detail::get_value(offsets_column->view(), offsets_size - 1, stream); auto chars_column = create_chars_child_column(chars_size, stream, mr); auto d_chars = mutable_column_device_view::create(chars_column->mutable_view(), stream); auto const d_input_chars = column_device_view::create(input.chars(), stream); diff --git a/cpp/src/strings/extract/extract_all.cu b/cpp/src/strings/extract/extract_all.cu index fcd05ee9dc6..8a2f8f0cbfc 100644 --- a/cpp/src/strings/extract/extract_all.cu +++ b/cpp/src/strings/extract/extract_all.cu @@ -50,7 +50,7 @@ namespace { */ struct extract_fn { column_device_view const d_strings; - offset_type const* d_offsets; + size_type const* d_offsets; string_index_pair* d_indices; __device__ void operator()(size_type const idx, @@ -119,7 +119,7 @@ std::unique_ptr extract_all_record(strings_column_view const& input, // Get the match counts for each string. 
// This column will become the output lists child offsets column. auto offsets = count_matches(*d_strings, *d_prog, strings_count + 1, stream, mr); - auto d_offsets = offsets->mutable_view().data(); + auto d_offsets = offsets->mutable_view().data(); // Compute null output rows auto [null_mask, null_count] = cudf::detail::valid_if( @@ -138,10 +138,10 @@ std::unique_ptr extract_all_record(strings_column_view const& input, d_offsets + strings_count + 1, d_offsets, [groups] __device__(auto v) { return v * groups; }, - offset_type{0}, + size_type{0}, thrust::plus{}); auto const total_groups = - cudf::detail::get_value(offsets->view(), strings_count, stream); + cudf::detail::get_value(offsets->view(), strings_count, stream); rmm::device_uvector indices(total_groups, stream); diff --git a/cpp/src/strings/json/json_path.cu b/cpp/src/strings/json/json_path.cu index f4dfafeb51f..be5b089c6e0 100644 --- a/cpp/src/strings/json/json_path.cu +++ b/cpp/src/strings/json/json_path.cu @@ -901,7 +901,7 @@ template __launch_bounds__(block_size) __global__ void get_json_object_kernel(column_device_view col, path_operator const* const commands, - offset_type* output_offsets, + size_type* output_offsets, thrust::optional out_buf, thrust::optional out_validity, thrust::optional out_valid_count, @@ -932,7 +932,7 @@ __launch_bounds__(block_size) __global__ // filled in only during the precompute step. during the compute step, the offsets // are fed back in so we do -not- want to write them out - if (!out_buf.has_value()) { output_offsets[tid] = static_cast(output_size); } + if (!out_buf.has_value()) { output_offsets[tid] = static_cast(output_size); } // validity filled in only during the output step if (out_validity.has_value()) { @@ -995,7 +995,7 @@ std::unique_ptr get_json_object(cudf::strings_column_view const& c <<>>( *cdv, std::get<0>(preprocess).value().data(), - offsets_view.head(), + offsets_view.head(), thrust::nullopt, thrust::nullopt, thrust::nullopt, @@ -1003,12 +1003,12 @@ std::unique_ptr get_json_object(cudf::strings_column_view const& c // convert sizes to offsets thrust::exclusive_scan(rmm::exec_policy(stream), - offsets_view.head(), - offsets_view.head() + col.size() + 1, - offsets_view.head(), + offsets_view.head(), + offsets_view.head() + col.size() + 1, + offsets_view.head(), 0); size_type const output_size = - cudf::detail::get_value(offsets_view, col.size(), stream); + cudf::detail::get_value(offsets_view, col.size(), stream); // allocate output string column auto chars = create_chars_child_column(output_size, stream, mr); @@ -1026,7 +1026,7 @@ std::unique_ptr get_json_object(cudf::strings_column_view const& c <<>>( *cdv, std::get<0>(preprocess).value().data(), - offsets_view.head(), + offsets_view.head(), chars_view.head(), static_cast(validity.data()), d_valid_count.data(), diff --git a/cpp/src/strings/padding.cu b/cpp/src/strings/padding.cu index da6d01c92dc..c501a8bf7b4 100644 --- a/cpp/src/strings/padding.cu +++ b/cpp/src/strings/padding.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2022, NVIDIA CORPORATION. + * Copyright (c) 2019-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
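Several hunks above (url_decode, extract_all_record, get_json_object) repeat one idiom: write a per-row byte count, run an exclusive scan so the counts become offsets, then read the final offset back to size the chars child column. The device code uses thrust::exclusive_scan and cudf::detail::get_value; a host-side sketch of the same arithmetic:

    #include <cstdint>
    #include <iostream>
    #include <numeric>
    #include <vector>

    using size_type = std::int32_t;  // stand-in for cudf::size_type

    int main() {
      // One output-byte count per row, plus one trailing slot for the scan.
      std::vector<size_type> counts{5, 0, 12, 3, 0};

      // Exclusive scan turns per-row sizes into begin offsets in place:
      // {5, 0, 12, 3, 0} -> {0, 5, 5, 17, 20}
      std::exclusive_scan(counts.begin(), counts.end(), counts.begin(), size_type{0});

      // The last element is the total number of output bytes, i.e. the size of
      // the chars child column the second kernel pass will fill.
      size_type total_bytes = counts.back();
      std::cout << "chars column size: " << total_bytes << '\n';  // 20
    }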
@@ -46,7 +46,7 @@ struct base_fn { column_device_view const d_column; size_type const width; size_type const fill_char_size; - offset_type* d_offsets{}; + size_type* d_offsets{}; char* d_chars{}; base_fn(column_device_view const& d_column, size_type width, size_type fill_char_size) diff --git a/cpp/src/strings/repeat_strings.cu b/cpp/src/strings/repeat_strings.cu index 4e0294f188c..396e1e6a2ac 100644 --- a/cpp/src/strings/repeat_strings.cu +++ b/cpp/src/strings/repeat_strings.cu @@ -84,10 +84,10 @@ auto generate_empty_output(strings_column_view const& input, auto chars_column = create_chars_child_column(0, stream, mr); auto offsets_column = make_numeric_column( - data_type{type_to_id()}, strings_count + 1, mask_state::UNALLOCATED, stream, mr); - CUDF_CUDA_TRY(cudaMemsetAsync(offsets_column->mutable_view().template data(), + data_type{type_to_id()}, strings_count + 1, mask_state::UNALLOCATED, stream, mr); + CUDF_CUDA_TRY(cudaMemsetAsync(offsets_column->mutable_view().template data(), 0, - offsets_column->size() * sizeof(offset_type), + offsets_column->size() * sizeof(size_type), stream.value())); return make_strings_column(strings_count, @@ -109,7 +109,7 @@ struct compute_size_and_repeat_fn { size_type const repeat_times; bool const has_nulls; - offset_type* d_offsets{nullptr}; + size_type* d_offsets{nullptr}; // If d_chars == nullptr: only compute sizes of the output strings. // If d_chars != nullptr: only repeat strings. @@ -184,7 +184,7 @@ struct compute_sizes_and_repeat_fn { bool const strings_has_nulls; bool const rtimes_has_nulls; - offset_type* d_offsets{nullptr}; + size_type* d_offsets{nullptr}; // If d_chars == nullptr: only compute sizes of the output strings. // If d_chars != nullptr: only repeat strings. diff --git a/cpp/src/strings/reverse.cu b/cpp/src/strings/reverse.cu index 3c1fae7a00f..090705ac25d 100644 --- a/cpp/src/strings/reverse.cu +++ b/cpp/src/strings/reverse.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022, NVIDIA CORPORATION. + * Copyright (c) 2022-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -37,7 +37,7 @@ namespace { */ struct reverse_characters_fn { column_device_view const d_strings; - offset_type const* d_offsets; + size_type const* d_offsets; char* d_chars; __device__ void operator()(size_type idx) @@ -64,7 +64,7 @@ std::unique_ptr reverse(strings_column_view const& input, // copy the column; replace data in the chars column auto result = std::make_unique(input.parent(), stream, mr); auto const d_offsets = - result->view().child(strings_column_view::offsets_column_index).data(); + result->view().child(strings_column_view::offsets_column_index).data(); auto d_chars = result->mutable_view().child(strings_column_view::chars_column_index).data(); auto const d_column = column_device_view::create(input.parent(), stream); diff --git a/cpp/src/strings/search/find_multiple.cu b/cpp/src/strings/search/find_multiple.cu index 1907c0d749b..4a823ad1dcb 100644 --- a/cpp/src/strings/search/find_multiple.cu +++ b/cpp/src/strings/search/find_multiple.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2022, NVIDIA CORPORATION. + * Copyright (c) 2019-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
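In the find_multiple hunk just below, every output list holds exactly targets_count entries, so the offsets column is simply the arithmetic sequence 0, k, 2k, ... built by cudf::detail::sequence with a numeric_scalar step of targets_count. A host-side sketch of the equivalent computation (the function name is illustrative):

    #include <cstdint>
    #include <iostream>
    #include <vector>

    using size_type = std::int32_t;  // stand-in for cudf::size_type

    // Offsets for a lists column in which every row holds exactly
    // `targets_count` child entries: 0, k, 2k, ..., strings_count * k.
    std::vector<size_type> fixed_size_list_offsets(size_type strings_count,
                                                   size_type targets_count) {
      std::vector<size_type> offsets(strings_count + 1);
      for (size_type i = 0; i <= strings_count; ++i) offsets[i] = i * targets_count;
      return offsets;
    }

    int main() {
      for (size_type o : fixed_size_list_offsets(/*strings_count=*/4, /*targets_count=*/3))
        std::cout << o << ' ';  // 0 3 6 9 12
      std::cout << '\n';
    }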
@@ -70,8 +70,8 @@ std::unique_ptr find_multiple(strings_column_view const& input, results->set_null_count(0); auto offsets = cudf::detail::sequence(strings_count + 1, - numeric_scalar(0), - numeric_scalar(targets_count), + numeric_scalar(0), + numeric_scalar(targets_count), stream, mr); return make_lists_column(strings_count, diff --git a/cpp/src/strings/search/findall.cu b/cpp/src/strings/search/findall.cu index 596fbb39d15..2df64c6a0a7 100644 --- a/cpp/src/strings/search/findall.cu +++ b/cpp/src/strings/search/findall.cu @@ -50,7 +50,7 @@ namespace { */ struct findall_fn { column_device_view const d_strings; - offset_type const* d_offsets; + size_type const* d_offsets; string_index_pair* d_indices; __device__ void operator()(size_type const idx, reprog_device const prog, int32_t const prog_idx) @@ -78,7 +78,7 @@ struct findall_fn { std::unique_ptr findall_util(column_device_view const& d_strings, reprog_device& d_prog, size_type total_matches, - offset_type const* d_offsets, + size_type const* d_offsets, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { @@ -106,7 +106,7 @@ std::unique_ptr findall(strings_column_view const& input, // Create lists offsets column auto offsets = count_matches(*d_strings, *d_prog, strings_count + 1, stream, mr); - auto d_offsets = offsets->mutable_view().data(); + auto d_offsets = offsets->mutable_view().data(); // Convert counts into offsets thrust::exclusive_scan( diff --git a/cpp/src/strings/split/split_re.cu b/cpp/src/strings/split/split_re.cu index 0dc92cf343c..9aeb6b69bdc 100644 --- a/cpp/src/strings/split/split_re.cu +++ b/cpp/src/strings/split/split_re.cu @@ -60,7 +60,7 @@ enum class split_direction { struct token_reader_fn { column_device_view const d_strings; split_direction const direction; - offset_type const* d_token_offsets; + size_type const* d_token_offsets; string_index_pair* d_tokens; __device__ void operator()(size_type const idx, reprog_device const prog, int32_t const prog_idx) @@ -143,17 +143,17 @@ rmm::device_uvector generate_tokens(column_device_view const& auto const begin = thrust::make_counting_iterator(0); auto const end = thrust::make_counting_iterator(strings_count); - auto const d_offsets = offsets.data(); + auto const d_offsets = offsets.data(); // convert match counts to token offsets auto map_fn = [d_strings, d_offsets, max_tokens] __device__(auto idx) { return d_strings.is_null(idx) ? 
0 : std::min(d_offsets[idx], max_tokens) + 1; }; thrust::transform_exclusive_scan( - rmm::exec_policy(stream), begin, end + 1, d_offsets, map_fn, 0, thrust::plus{}); + rmm::exec_policy(stream), begin, end + 1, d_offsets, map_fn, 0, thrust::plus{}); // the last offset entry is the total number of tokens to be generated - auto const total_tokens = cudf::detail::get_value(offsets, strings_count, stream); + auto const total_tokens = cudf::detail::get_value(offsets, strings_count, stream); rmm::device_uvector tokens(total_tokens, stream); if (total_tokens == 0) { return tokens; } @@ -176,7 +176,7 @@ rmm::device_uvector generate_tokens(column_device_view const& struct tokens_transform_fn { column_device_view const d_strings; string_index_pair const* d_tokens; - offset_type const* d_token_offsets; + size_type const* d_token_offsets; size_type const column_index; __device__ string_index_pair operator()(size_type idx) const @@ -215,7 +215,7 @@ std::unique_ptr split_re(strings_column_view const& input, auto offsets = count_matches( *d_strings, *d_prog, strings_count + 1, stream, rmm::mr::get_current_device_resource()); auto offsets_view = offsets->mutable_view(); - auto d_offsets = offsets_view.data(); + auto d_offsets = offsets_view.data(); // get the split tokens from the input column; this also converts the counts into offsets auto tokens = generate_tokens(*d_strings, *d_prog, direction, maxsplit, offsets_view, stream); diff --git a/cpp/src/strings/strings_column_view.cpp b/cpp/src/strings/strings_column_view.cpp index 6de478d3e1e..4b206666d4b 100644 --- a/cpp/src/strings/strings_column_view.cpp +++ b/cpp/src/strings/strings_column_view.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2021, NVIDIA CORPORATION. + * Copyright (c) 2019-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -34,7 +34,7 @@ column_view strings_column_view::offsets() const strings_column_view::offset_iterator strings_column_view::offsets_begin() const { - return offsets().begin() + offset(); + return offsets().begin() + offset(); } strings_column_view::offset_iterator strings_column_view::offsets_end() const diff --git a/cpp/src/text/subword/bpe_tokenizer.cu b/cpp/src/text/subword/bpe_tokenizer.cu index 413fb2497c0..ac55fe76db1 100644 --- a/cpp/src/text/subword/bpe_tokenizer.cu +++ b/cpp/src/text/subword/bpe_tokenizer.cu @@ -199,7 +199,7 @@ struct byte_pair_encoding_fn { } auto const offset = d_strings.child(cudf::strings_column_view::offsets_column_index) - .element(idx); + .element(idx); auto const d_indices = d_byte_indices + offset; // initialize the byte indices for this string; @@ -304,7 +304,7 @@ struct byte_pair_encoding_fn { struct build_encoding_fn { cudf::column_device_view const d_strings; cudf::size_type const* d_byte_indices; - cudf::offset_type const* d_offsets; + cudf::size_type const* d_offsets; char* d_chars{}; __device__ void operator()(cudf::size_type idx) @@ -314,7 +314,7 @@ struct build_encoding_fn { if (d_str.empty()) { return; } auto const offset = d_strings.child(cudf::strings_column_view::offsets_column_index) - .element(idx); + .element(idx); auto const d_indices = d_byte_indices + offset; auto d_output = d_chars ? 
d_chars + d_offsets[idx] : nullptr; @@ -362,12 +362,12 @@ std::unique_ptr byte_pair_encoding( auto const d_merges = cudf::column_device_view::create(merge_pairs.get_merge_pairs(), stream); auto const d_strings = cudf::column_device_view::create(input.parent(), stream); - auto offsets = cudf::make_numeric_column(cudf::data_type{cudf::type_to_id()}, + auto offsets = cudf::make_numeric_column(cudf::data_type{cudf::type_to_id()}, static_cast(input.size() + 1), cudf::mask_state::UNALLOCATED, stream, rmm::mr::get_current_device_resource()); - auto d_offsets = offsets->mutable_view().data(); + auto d_offsets = offsets->mutable_view().data(); byte_pair_encoding_fn fn{*d_merges, *d_strings, @@ -406,14 +406,14 @@ std::unique_ptr byte_pair_encoding( */ struct edge_of_space_fn { cudf::column_device_view const d_strings; - __device__ bool operator()(cudf::offset_type offset) + __device__ bool operator()(cudf::size_type offset) { auto const d_chars = d_strings.child(cudf::strings_column_view::chars_column_index).data(); if (is_whitespace(d_chars[offset]) || !is_whitespace(d_chars[offset - 1])) { return false; } auto const offsets = d_strings.child(cudf::strings_column_view::offsets_column_index); - auto const d_offsets = offsets.data() + d_strings.offset(); + auto const d_offsets = offsets.data() + d_strings.offset(); // ignore offsets outside sliced range if (offset < d_offsets[0] || offset >= d_offsets[d_strings.size()]) { return false; } @@ -452,12 +452,12 @@ std::unique_ptr space_offsets(cudf::strings_column_view const& inp auto const space_count = thrust::count_if(rmm::exec_policy(stream), begin, end, edge_of_space); // copy space offsets - rmm::device_uvector space_offsets(space_count, stream); + rmm::device_uvector space_offsets(space_count, stream); thrust::copy_if(rmm::exec_policy(stream), begin, end, space_offsets.data(), edge_of_space); // create output offsets auto result = - cudf::make_numeric_column(cudf::data_type{cudf::type_to_id()}, + cudf::make_numeric_column(cudf::data_type{cudf::type_to_id()}, static_cast(space_count + input.size() + 1), cudf::mask_state::UNALLOCATED, stream, @@ -469,7 +469,7 @@ std::unique_ptr space_offsets(cudf::strings_column_view const& inp input.offsets_end(), space_offsets.begin(), space_offsets.end(), - result->mutable_view().begin()); + result->mutable_view().begin()); return result; } diff --git a/cpp/src/text/subword/load_merges_file.cu b/cpp/src/text/subword/load_merges_file.cu index dffe035ad35..b39413af98f 100644 --- a/cpp/src/text/subword/load_merges_file.cu +++ b/cpp/src/text/subword/load_merges_file.cu @@ -78,7 +78,7 @@ std::unique_ptr load_file_to_column(std::string const& filename_me CUDF_EXPECTS(merges_file.good(), "Could not open " + filename_merges); std::vector chars{}; - std::vector offsets(1, 0); + std::vector offsets(1, 0); std::string line; std::getline(merges_file, line); diff --git a/cpp/src/transform/row_bit_count.cu b/cpp/src/transform/row_bit_count.cu index 1507a8ce7c6..b151b44565d 100644 --- a/cpp/src/transform/row_bit_count.cu +++ b/cpp/src/transform/row_bit_count.cu @@ -352,10 +352,10 @@ __device__ size_type row_size_functor::operator()(column_device_vie return 0; } - auto const offsets_size = sizeof(offset_type) * CHAR_BIT; + auto const offsets_size = sizeof(size_type) * CHAR_BIT; auto const validity_size = col.nullable() ? 
1 : 0; auto const chars_size = - (offsets.data()[row_end] - offsets.data()[row_start]) * CHAR_BIT; + (offsets.data()[row_end] - offsets.data()[row_start]) * CHAR_BIT; return ((offsets_size + validity_size) * num_rows) + chars_size; } @@ -372,7 +372,7 @@ __device__ size_type row_size_functor::operator()(column_device_view { auto const num_rows{span.row_end - span.row_start}; - auto const offsets_size = sizeof(offset_type) * CHAR_BIT; + auto const offsets_size = sizeof(size_type) * CHAR_BIT; auto const validity_size = col.nullable() ? 1 : 0; return (offsets_size + validity_size) * num_rows; } @@ -451,10 +451,10 @@ __global__ void compute_row_sizes(device_span cols, // if this is a list column, update the working span from our offsets if (col.type().id() == type_id::LIST && col.size() > 0) { column_device_view const& offsets = col.child(lists_column_view::offsets_column_index); - auto const base_offset = offsets.data()[col.offset()]; + auto const base_offset = offsets.data()[col.offset()]; cur_span.row_start = - offsets.data()[cur_span.row_start + col.offset()] - base_offset; - cur_span.row_end = offsets.data()[cur_span.row_end + col.offset()] - base_offset; + offsets.data()[cur_span.row_start + col.offset()] - base_offset; + cur_span.row_end = offsets.data()[cur_span.row_end + col.offset()] - base_offset; } last_branch_depth = info[idx].branch_depth_end; diff --git a/cpp/tests/column/factories_test.cpp b/cpp/tests/column/factories_test.cpp index 3eccb6b2a55..66de4e19b27 100644 --- a/cpp/tests/column/factories_test.cpp +++ b/cpp/tests/column/factories_test.cpp @@ -508,7 +508,7 @@ TYPED_TEST_SUITE(ListsDictionaryLeafTest, cudf::test::FixedWidthTypes); TYPED_TEST(ListsDictionaryLeafTest, FromNonNested) { using DCW = cudf::test::dictionary_column_wrapper; - using offset_t = cudf::test::fixed_width_column_wrapper; + using offset_t = cudf::test::fixed_width_column_wrapper; auto s = cudf::make_list_scalar(DCW({1, 3, -1, 1, 3}, {1, 1, 0, 1, 1})); auto col = cudf::make_column_from_scalar(*s, 2); @@ -524,7 +524,7 @@ TYPED_TEST(ListsDictionaryLeafTest, FromNonNested) TYPED_TEST(ListsDictionaryLeafTest, FromNested) { using DCW = cudf::test::dictionary_column_wrapper; - using offset_t = cudf::test::fixed_width_column_wrapper; + using offset_t = cudf::test::fixed_width_column_wrapper; DCW leaf({1, 3, -1, 1, 3, 1, 3, -1, 1, 3}, {1, 1, 0, 1, 1, 1, 1, 0, 1, 1}); offset_t offsets{0, 3, 3, 6, 6, 10}; @@ -617,7 +617,7 @@ TYPED_TEST(ListsStructsLeafTest, FromNonNested) { using LCWinner_t = cudf::test::lists_column_wrapper; using StringCW = cudf::test::strings_column_wrapper; - using offset_t = cudf::test::fixed_width_column_wrapper; + using offset_t = cudf::test::fixed_width_column_wrapper; using valid_t = std::vector; auto data = this->make_test_structs_column( @@ -648,7 +648,7 @@ TYPED_TEST(ListsStructsLeafTest, FromNested) { using LCWinner_t = cudf::test::lists_column_wrapper; using StringCW = cudf::test::strings_column_wrapper; - using offset_t = cudf::test::fixed_width_column_wrapper; + using offset_t = cudf::test::fixed_width_column_wrapper; using valid_t = std::vector; auto leaf = this->make_test_structs_column( {{1, 2}, {0, 1}}, @@ -702,7 +702,7 @@ TEST_F(ListsZeroLengthColumnTest, MixedTypes) using FCW = cudf::test::fixed_width_column_wrapper; using StringCW = cudf::test::strings_column_wrapper; using LCW = cudf::test::lists_column_wrapper; - using offset_t = cudf::test::fixed_width_column_wrapper; + using offset_t = cudf::test::fixed_width_column_wrapper; { auto s = cudf::make_list_scalar(FCW{1, 2, 
3}); auto got = cudf::make_column_from_scalar(*s, 0); @@ -759,7 +759,7 @@ TEST_F(ListsZeroLengthColumnTest, SuperimposeNulls) using FCW = cudf::test::fixed_width_column_wrapper; using StringCW = cudf::test::strings_column_wrapper; using LCW = cudf::test::lists_column_wrapper; - using offset_t = cudf::test::fixed_width_column_wrapper; + using offset_t = cudf::test::fixed_width_column_wrapper; auto const lists = [&] { auto child = this diff --git a/cpp/tests/copying/concatenate_tests.cpp b/cpp/tests/copying/concatenate_tests.cpp index 99ef1df1c2b..7701ca1ba56 100644 --- a/cpp/tests/copying/concatenate_tests.cpp +++ b/cpp/tests/copying/concatenate_tests.cpp @@ -377,7 +377,7 @@ TEST_F(OverflowTest, OverflowTest) constexpr auto size = static_cast(static_cast(1024) * 1024 * 1024); // try and concatenate 6 string columns of with 1 billion chars in each - auto offsets = cudf::test::fixed_width_column_wrapper{0, size}; + auto offsets = cudf::test::fixed_width_column_wrapper{0, size}; auto many_chars = cudf::make_fixed_width_column(cudf::data_type{cudf::type_id::INT8}, size); auto col = cudf::make_strings_column( 1, offsets.release(), std::move(many_chars), 0, rmm::device_buffer{}); @@ -418,7 +418,7 @@ TEST_F(OverflowTest, OverflowTest) cudf::make_structs_column(inner_size, std::move(children), 0, rmm::device_buffer{}); // list - auto offsets = cudf::test::fixed_width_column_wrapper{0, inner_size}; + auto offsets = cudf::test::fixed_width_column_wrapper{0, inner_size}; auto col = cudf::make_lists_column(1, offsets.release(), std::move(struct_col), 0, rmm::device_buffer{}); @@ -435,7 +435,7 @@ TEST_F(OverflowTest, OverflowTest) constexpr cudf::size_type size = 3; // list - auto offsets = cudf::test::fixed_width_column_wrapper{0, 0, 0, inner_size}; + auto offsets = cudf::test::fixed_width_column_wrapper{0, 0, 0, inner_size}; auto many_chars = cudf::make_fixed_width_column(cudf::data_type{cudf::type_id::INT8}, inner_size); auto list_col = @@ -643,7 +643,7 @@ TEST_F(OverflowTest, Presliced) constexpr cudf::size_type list_size = inner_size / num_rows; // list - auto offsets = cudf::test::fixed_width_column_wrapper{ + auto offsets = cudf::test::fixed_width_column_wrapper{ 0, list_size, (list_size * 2) - 1, list_size * 3, inner_size}; auto many_chars = cudf::make_fixed_width_column(cudf::data_type{cudf::type_id::INT8}, inner_size); diff --git a/cpp/tests/copying/copy_if_else_nested_tests.cpp b/cpp/tests/copying/copy_if_else_nested_tests.cpp index ff28156ef1d..579e1bdce8a 100644 --- a/cpp/tests/copying/copy_if_else_nested_tests.cpp +++ b/cpp/tests/copying/copy_if_else_nested_tests.cpp @@ -332,7 +332,7 @@ TYPED_TEST(TypedCopyIfElseNestedTest, ListsWithStructs) using strings = cudf::test::strings_column_wrapper; using structs = cudf::test::structs_column_wrapper; using bools = cudf::test::fixed_width_column_wrapper; - using offsets = cudf::test::fixed_width_column_wrapper; + using offsets = cudf::test::fixed_width_column_wrapper; auto const null_at_0 = null_at(0); auto const null_at_3 = null_at(3); diff --git a/cpp/tests/copying/gather_struct_tests.cpp b/cpp/tests/copying/gather_struct_tests.cpp index ebfd950df4d..2bc18c706db 100644 --- a/cpp/tests/copying/gather_struct_tests.cpp +++ b/cpp/tests/copying/gather_struct_tests.cpp @@ -37,7 +37,7 @@ #include using vector_of_columns = std::vector>; -using gather_map_t = std::vector; +using gather_map_t = std::vector; using offsets = cudf::test::fixed_width_column_wrapper; using structs = cudf::test::structs_column_wrapper; using strings = 
cudf::test::strings_column_wrapper; @@ -54,7 +54,7 @@ using numerics = cudf::test::fixed_width_column_wrapper; template using lists = cudf::test::lists_column_wrapper; -auto constexpr null_index = std::numeric_limits::max(); +auto constexpr null_index = std::numeric_limits::max(); struct StructGatherTest : public cudf::test::BaseFixture {}; diff --git a/cpp/tests/copying/get_value_tests.cpp b/cpp/tests/copying/get_value_tests.cpp index a18ed6a1ccf..d322fbe11f2 100644 --- a/cpp/tests/copying/get_value_tests.cpp +++ b/cpp/tests/copying/get_value_tests.cpp @@ -311,7 +311,7 @@ TYPED_TEST(ListGetFixedWidthValueTest, NestedGetNull) { using LCW = cudf::test::lists_column_wrapper; using FCW = cudf::test::fixed_width_column_wrapper; - using offset_t = cudf::test::fixed_width_column_wrapper; + using offset_t = cudf::test::fixed_width_column_wrapper; std::vector valid{1, 0, 1, 0}; // clang-format off @@ -466,7 +466,7 @@ TEST_F(ListGetStringValueTest, NestedGetNonNullEmpty) TEST_F(ListGetStringValueTest, NestedGetNull) { using LCW = cudf::test::lists_column_wrapper; - using offset_t = cudf::test::fixed_width_column_wrapper; + using offset_t = cudf::test::fixed_width_column_wrapper; using StringCW = cudf::test::strings_column_wrapper; std::vector valid{0, 0, 1, 1}; @@ -508,7 +508,7 @@ struct ListGetStructValueTest : public cudf::test::BaseFixture { */ std::unique_ptr make_test_lists_column( cudf::size_type num_lists, - cudf::test::fixed_width_column_wrapper offsets, + cudf::test::fixed_width_column_wrapper offsets, std::unique_ptr child, std::initializer_list null_mask) { @@ -776,7 +776,7 @@ TYPED_TEST(ListGetStructValueTest, NestedGetNull) // NULL <- cudf::get_element(2) using valid_t = std::vector; - using offset_t = cudf::test::fixed_width_column_wrapper; + using offset_t = cudf::test::fixed_width_column_wrapper; auto list_column = this->make_test_lists_column(2, {0, 2, 3}, this->leaf_data(), {1, 1}); auto list_column_nested = @@ -900,12 +900,12 @@ TEST_F(StructGetValueTest, multi_level_nested) // col fields LCW l3({LCW{1, 1, 1}, LCW{2, 2}, LCW{3}}, validity_mask_t{false, true, true}.begin()); cudf::test::structs_column_wrapper l2{l3}; - auto l1 = cudf::make_lists_column( - 1, - cudf::test::fixed_width_column_wrapper{0, 3}.release(), - l2.release(), - 0, - cudf::create_null_mask(1, cudf::mask_state::UNALLOCATED)); + auto l1 = + cudf::make_lists_column(1, + cudf::test::fixed_width_column_wrapper{0, 3}.release(), + l2.release(), + 0, + cudf::create_null_mask(1, cudf::mask_state::UNALLOCATED)); std::vector> l0_fields; l0_fields.emplace_back(std::move(l1)); cudf::test::structs_column_wrapper l0(std::move(l0_fields)); diff --git a/cpp/tests/copying/scatter_list_scalar_tests.cpp b/cpp/tests/copying/scatter_list_scalar_tests.cpp index 9dda3c12edf..42d2e004d6b 100644 --- a/cpp/tests/copying/scatter_list_scalar_tests.cpp +++ b/cpp/tests/copying/scatter_list_scalar_tests.cpp @@ -307,7 +307,7 @@ TYPED_TEST_SUITE(ScatterListOfStructScalarTest, cudf::test::FixedWidthTypesWitho TYPED_TEST(ScatterListOfStructScalarTest, Basic) { using LCW = cudf::test::lists_column_wrapper; - using offset_t = cudf::test::fixed_width_column_wrapper; + using offset_t = cudf::test::fixed_width_column_wrapper; auto data = this->make_test_structs({{42, 42, 42}, {1, 0, 1}}, @@ -346,7 +346,7 @@ TYPED_TEST(ScatterListOfStructScalarTest, Basic) TYPED_TEST(ScatterListOfStructScalarTest, EmptyValidScalar) { using LCW = cudf::test::lists_column_wrapper; - using offset_t = cudf::test::fixed_width_column_wrapper; + using offset_t = 
cudf::test::fixed_width_column_wrapper; auto data = this->make_test_structs({}, {}, LCW{}, {}); auto slr = std::make_unique(data, true); @@ -379,7 +379,7 @@ TYPED_TEST(ScatterListOfStructScalarTest, EmptyValidScalar) TYPED_TEST(ScatterListOfStructScalarTest, NullScalar) { using LCW = cudf::test::lists_column_wrapper; - using offset_t = cudf::test::fixed_width_column_wrapper; + using offset_t = cudf::test::fixed_width_column_wrapper; auto data = this->make_test_structs({}, {}, {}, {}); auto slr = std::make_unique(data, false); @@ -411,7 +411,7 @@ TYPED_TEST(ScatterListOfStructScalarTest, NullScalar) TYPED_TEST(ScatterListOfStructScalarTest, NullableTargetRow) { using LCW = cudf::test::lists_column_wrapper; - using offset_t = cudf::test::fixed_width_column_wrapper; + using offset_t = cudf::test::fixed_width_column_wrapper; auto data = this->make_test_structs({{42, 42, 42}, {1, 0, 1}}, diff --git a/cpp/tests/copying/split_tests.cpp b/cpp/tests/copying/split_tests.cpp index c9a53d6ebe0..da85242410b 100644 --- a/cpp/tests/copying/split_tests.cpp +++ b/cpp/tests/copying/split_tests.cpp @@ -2072,8 +2072,7 @@ TEST_F(ContiguousSplitTableCornerCases, PreSplitList) // list> { - cudf::test::fixed_width_column_wrapper offsets{ - 0, 2, 5, 7, 10, 12, 14, 17, 20}; + cudf::test::fixed_width_column_wrapper offsets{0, 2, 5, 7, 10, 12, 14, 17, 20}; cudf::test::fixed_width_column_wrapper floats{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20}; cudf::test::structs_column_wrapper data({floats}); @@ -2131,8 +2130,7 @@ TEST_F(ContiguousSplitTableCornerCases, PreSplitStructs) // struct> { - cudf::test::fixed_width_column_wrapper offsets{ - 0, 2, 5, 7, 10, 12, 14, 17, 20}; + cudf::test::fixed_width_column_wrapper offsets{0, 2, 5, 7, 10, 12, 14, 17, 20}; cudf::test::fixed_width_column_wrapper floats{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20}; cudf::test::structs_column_wrapper data({floats}); diff --git a/cpp/tests/groupby/collect_list_tests.cpp b/cpp/tests/groupby/collect_list_tests.cpp index 485febe0d92..749f4013013 100644 --- a/cpp/tests/groupby/collect_list_tests.cpp +++ b/cpp/tests/groupby/collect_list_tests.cpp @@ -146,7 +146,7 @@ TYPED_TEST(groupby_collect_list_test, CollectOnEmptyInputLists) using LCW = cudf::test::lists_column_wrapper; - auto offsets = cudf::data_type{cudf::type_to_id()}; + auto offsets = cudf::data_type{cudf::type_to_id()}; cudf::test::fixed_width_column_wrapper keys{}; auto values = @@ -176,7 +176,7 @@ TYPED_TEST(groupby_collect_list_test, CollectOnEmptyInputListsOfStructs) auto values = cudf::make_lists_column(0, - cudf::make_empty_column(cudf::type_to_id()), + cudf::make_empty_column(cudf::type_to_id()), struct_column.release(), 0, {}); @@ -188,13 +188,13 @@ TYPED_TEST(groupby_collect_list_test, CollectOnEmptyInputListsOfStructs) auto expect_child = cudf::make_lists_column(0, - cudf::make_empty_column(cudf::type_to_id()), + cudf::make_empty_column(cudf::type_to_id()), expect_struct_column.release(), 0, {}); auto expect_values = cudf::make_lists_column(0, - cudf::make_empty_column(cudf::type_to_id()), + cudf::make_empty_column(cudf::type_to_id()), std::move(expect_child), 0, {}); diff --git a/cpp/tests/io/parquet_test.cpp b/cpp/tests/io/parquet_test.cpp index a5054daed19..ea2bad0cabf 100644 --- a/cpp/tests/io/parquet_test.cpp +++ b/cpp/tests/io/parquet_test.cpp @@ -2687,8 +2687,8 @@ TEST_F(ParquetReaderTest, UserBoundsWithNullsMixedTypes) constexpr int floats_per_row = 4; auto c1_offset_iter = 
cudf::detail::make_counting_transform_iterator( 0, [floats_per_row](cudf::size_type idx) { return idx * floats_per_row; }); - cudf::test::fixed_width_column_wrapper c1_offsets( - c1_offset_iter, c1_offset_iter + num_rows + 1); + cudf::test::fixed_width_column_wrapper c1_offsets(c1_offset_iter, + c1_offset_iter + num_rows + 1); cudf::test::fixed_width_column_wrapper c1_floats( values, values + (num_rows * floats_per_row), valids); auto [null_mask, null_count] = cudf::test::detail::make_null_mask(valids, valids + num_rows); @@ -2711,8 +2711,8 @@ TEST_F(ParquetReaderTest, UserBoundsWithNullsMixedTypes) cudf::test::strings_column_wrapper string_col{string_iter, string_iter + num_string_rows}; auto offset_iter = cudf::detail::make_counting_transform_iterator( 0, [string_per_row](cudf::size_type idx) { return idx * string_per_row; }); - cudf::test::fixed_width_column_wrapper offsets(offset_iter, - offset_iter + num_rows + 1); + cudf::test::fixed_width_column_wrapper offsets(offset_iter, + offset_iter + num_rows + 1); auto _c3_valids = cudf::detail::make_counting_transform_iterator(0, [&](int index) { return index % 200; }); @@ -5034,8 +5034,8 @@ TEST_F(ParquetReaderTest, NestingOptimizationTest) 0, [depth, rows_per_level](cudf::size_type i) { return i * rows_per_level; }); total_values_produced += (num_rows + 1); - cudf::test::fixed_width_column_wrapper offsets(offsets_iter, - offsets_iter + num_rows + 1); + cudf::test::fixed_width_column_wrapper offsets(offsets_iter, + offsets_iter + num_rows + 1); auto c = cudf::make_lists_column(num_rows, offsets.release(), std::move(prev_col), 0, {}); prev_col = std::move(c); } diff --git a/cpp/tests/lists/extract_tests.cpp b/cpp/tests/lists/extract_tests.cpp index 2c2b3c8b29c..017cd471e01 100644 --- a/cpp/tests/lists/extract_tests.cpp +++ b/cpp/tests/lists/extract_tests.cpp @@ -269,7 +269,7 @@ TYPED_TEST(ListsExtractColumnIndicesTypedTest, ExtractElement) { using LCW = cudf::test::lists_column_wrapper; using FWCW = cudf::test::fixed_width_column_wrapper; - using indices = cudf::test::fixed_width_column_wrapper; + using indices = cudf::test::fixed_width_column_wrapper; auto input_column = LCW({LCW{3, 2, 1}, LCW{}, LCW{30, 20, 10, 50}, LCW{100, 120}, LCW{0}, LCW{}}, cudf::test::iterators::null_at(1)); @@ -329,7 +329,7 @@ TYPED_TEST(ListsExtractColumnIndicesTypedTest, ExtractElement) TYPED_TEST(ListsExtractColumnIndicesTypedTest, FailureCases) { using LCW = cudf::test::lists_column_wrapper; - using indices = cudf::test::fixed_width_column_wrapper; + using indices = cudf::test::fixed_width_column_wrapper; { // Non-empty input, with mismatched size of indices. 
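The extract tests just above (ListsExtractColumnIndicesTypedTest) pull a single element out of each list row, producing nulls for empty or out-of-range rows. A simplified host-side sketch of that lookup over a plain offsets/child layout; treating negative indices as counting from the end is an assumption of this sketch, not a statement about the cudf API:

    #include <cstdint>
    #include <iostream>
    #include <optional>
    #include <string>
    #include <vector>

    using size_type = std::int32_t;  // stand-in for cudf::size_type

    // Offsets/child representation of a lists<int> column:
    // row i spans child[offsets[i]] .. child[offsets[i+1]-1]
    struct lists_column {
      std::vector<size_type> offsets;
      std::vector<int> child;
    };

    // Extract element `index` from each list row; out-of-range rows yield nullopt.
    std::vector<std::optional<int>> extract_element(const lists_column& col, size_type index) {
      std::vector<std::optional<int>> out;
      for (std::size_t row = 0; row + 1 < col.offsets.size(); ++row) {
        size_type begin  = col.offsets[row];
        size_type length = col.offsets[row + 1] - begin;
        size_type i      = index < 0 ? length + index : index;  // sketch-only convention
        if (i < 0 || i >= length) out.push_back(std::nullopt);
        else out.push_back(col.child[begin + i]);
      }
      return out;
    }

    int main() {
      lists_column col{{0, 3, 3, 7}, {3, 2, 1, 30, 20, 10, 50}};
      for (auto v : extract_element(col, 1))
        std::cout << (v ? std::to_string(*v) : "null") << ' ';  // 2 null 20
      std::cout << '\n';
    }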
@@ -361,7 +361,7 @@ TEST_F(ListsExtractColumnIndicesTest, ExtractStrings) { using LCW = cudf::test::lists_column_wrapper; using strings = cudf::test::strings_column_wrapper; - using indices = cudf::test::fixed_width_column_wrapper; + using indices = cudf::test::fixed_width_column_wrapper; auto input_column = LCW( {LCW{"3", "2", "1"}, LCW{}, LCW{"30", "20", "10", "50"}, LCW{"100", "120"}, LCW{"0"}, LCW{}}, diff --git a/cpp/tests/quantiles/percentile_approx_test.cpp b/cpp/tests/quantiles/percentile_approx_test.cpp index c6069acad8a..46d4066ddff 100644 --- a/cpp/tests/quantiles/percentile_approx_test.cpp +++ b/cpp/tests/quantiles/percentile_approx_test.cpp @@ -383,7 +383,7 @@ TEST_F(PercentileApproxTest, EmptyInput) cudf::tdigest::tdigest_column_view tdv(*empty); auto result = cudf::percentile_approx(tdv, percentiles); - cudf::test::fixed_width_column_wrapper offsets{0, 0, 0, 0}; + cudf::test::fixed_width_column_wrapper offsets{0, 0, 0, 0}; std::vector nulls{0, 0, 0}; auto [null_mask, null_count] = cudf::test::detail::make_null_mask(nulls.begin(), nulls.end()); @@ -415,7 +415,7 @@ TEST_F(PercentileApproxTest, EmptyPercentiles) cudf::tdigest::tdigest_column_view tdv(*tdigest_column.second[0].results[0]); auto result = cudf::percentile_approx(tdv, percentiles); - cudf::test::fixed_width_column_wrapper offsets{0, 0, 0}; + cudf::test::fixed_width_column_wrapper offsets{0, 0, 0}; std::vector nulls{0, 0}; auto [null_mask, null_count] = cudf::test::detail::make_null_mask(nulls.begin(), nulls.end()); diff --git a/cpp/tests/reductions/tdigest_tests.cu b/cpp/tests/reductions/tdigest_tests.cu index b0087766c0c..c8fec51e1c9 100644 --- a/cpp/tests/reductions/tdigest_tests.cu +++ b/cpp/tests/reductions/tdigest_tests.cu @@ -94,7 +94,7 @@ TEST_F(ReductionTDigestMerge, FewHeavyCentroids) cudf::test::fixed_width_column_wrapper c0c{1.0, 2.0}; cudf::test::fixed_width_column_wrapper c0w{100.0, 50.0}; cudf::test::structs_column_wrapper c0s({c0c, c0w}); - cudf::test::fixed_width_column_wrapper c0_offsets{0, 2}; + cudf::test::fixed_width_column_wrapper c0_offsets{0, 2}; auto c0l = cudf::make_lists_column(1, c0_offsets.release(), c0s.release(), 0, rmm::device_buffer{}); cudf::test::fixed_width_column_wrapper c0min{1.0}; @@ -111,7 +111,7 @@ TEST_F(ReductionTDigestMerge, FewHeavyCentroids) cudf::test::fixed_width_column_wrapper c1c{3.0, 4.0}; cudf::test::fixed_width_column_wrapper c1w{200.0, 50.0}; cudf::test::structs_column_wrapper c1s({c1c, c1w}); - cudf::test::fixed_width_column_wrapper c1_offsets{0, 2}; + cudf::test::fixed_width_column_wrapper c1_offsets{0, 2}; auto c1l = cudf::make_lists_column(1, c1_offsets.release(), c1s.release(), 0, rmm::device_buffer{}); cudf::test::fixed_width_column_wrapper c1min{3.0}; @@ -147,7 +147,7 @@ TEST_F(ReductionTDigestMerge, FewHeavyCentroids) cudf::test::fixed_width_column_wrapper ec{1.0, 2.0, 3.0, 4.0}; cudf::test::fixed_width_column_wrapper ew{100.0, 50.0, 200.0, 50.0}; cudf::test::structs_column_wrapper es({ec, ew}); - cudf::test::fixed_width_column_wrapper e_offsets{0, 4}; + cudf::test::fixed_width_column_wrapper e_offsets{0, 4}; auto el = cudf::make_lists_column(1, e_offsets.release(), es.release(), 0, rmm::device_buffer{}); cudf::test::fixed_width_column_wrapper emin{1.0}; cudf::test::fixed_width_column_wrapper emax{4.0}; diff --git a/cpp/tests/strings/array_tests.cpp b/cpp/tests/strings/array_tests.cpp index e8e603f8533..ecc38dfd26e 100644 --- a/cpp/tests/strings/array_tests.cpp +++ b/cpp/tests/strings/array_tests.cpp @@ -152,7 +152,7 @@ TEST_F(StringsColumnTest, 
GatherTooBig) { std::vector h_chars(3000000); cudf::test::fixed_width_column_wrapper chars(h_chars.begin(), h_chars.end()); - cudf::test::fixed_width_column_wrapper offsets({0, 3000000}); + cudf::test::fixed_width_column_wrapper offsets({0, 3000000}); auto input = cudf::column_view( cudf::data_type{cudf::type_id::STRING}, 1, nullptr, nullptr, 0, 0, {offsets, chars}); auto map = thrust::constant_iterator(0); diff --git a/cpp/tests/strings/contains_tests.cpp b/cpp/tests/strings/contains_tests.cpp index 868785b4612..0cb5023a32e 100644 --- a/cpp/tests/strings/contains_tests.cpp +++ b/cpp/tests/strings/contains_tests.cpp @@ -294,9 +294,9 @@ TEST_F(StringsContainsTests, HexTest) std::vector ascii_chars( // all possible matchable chars {thrust::make_counting_iterator(0), thrust::make_counting_iterator(127)}); auto const count = static_cast(ascii_chars.size()); - std::vector offsets( - {thrust::make_counting_iterator(0), - thrust::make_counting_iterator(0) + count + 1}); + std::vector offsets( + {thrust::make_counting_iterator(0), + thrust::make_counting_iterator(0) + count + 1}); auto d_chars = cudf::detail::make_device_uvector_sync( ascii_chars, cudf::get_default_stream(), rmm::mr::get_current_device_resource()); auto d_offsets = cudf::detail::make_device_uvector_sync( diff --git a/cpp/tests/strings/factories_test.cu b/cpp/tests/strings/factories_test.cu index d8594fa4923..a3d392cfed0 100644 --- a/cpp/tests/strings/factories_test.cu +++ b/cpp/tests/strings/factories_test.cu @@ -99,8 +99,8 @@ TEST_F(StringsFactoriesTest, CreateColumnFromPair) cudf::device_span(strings_view.chars().data(), strings_view.chars().size()), cudf::get_default_stream()); auto h_offsets_data = cudf::detail::make_std_vector_sync( - cudf::device_span( - strings_view.offsets().data() + strings_view.offset(), + cudf::device_span( + strings_view.offsets().data() + strings_view.offset(), strings_view.size() + 1), cudf::get_default_stream()); EXPECT_EQ(memcmp(h_buffer.data(), h_chars_data.data(), h_buffer.size()), 0); @@ -164,8 +164,8 @@ TEST_F(StringsFactoriesTest, CreateColumnFromOffsets) cudf::device_span(strings_view.chars().data(), strings_view.chars().size()), cudf::get_default_stream()); auto h_offsets_data = cudf::detail::make_std_vector_sync( - cudf::device_span( - strings_view.offsets().data() + strings_view.offset(), + cudf::device_span( + strings_view.offsets().data() + strings_view.offset(), strings_view.size() + 1), cudf::get_default_stream()); EXPECT_EQ(memcmp(h_buffer.data(), h_chars_data.data(), h_buffer.size()), 0); diff --git a/cpp/tests/transform/row_bit_count_test.cu b/cpp/tests/transform/row_bit_count_test.cu index 095495456e9..4832cdf816f 100644 --- a/cpp/tests/transform/row_bit_count_test.cu +++ b/cpp/tests/transform/row_bit_count_test.cu @@ -98,10 +98,10 @@ std::pair, std::unique_ptr> build_li // } cudf::test::fixed_width_column_wrapper values{ 1, 2, 3, 4, 5, 10, 6, 7, 8, 9, -1, -2, -3, -4, -5, -6, -7, -8, -9}; - cudf::test::fixed_width_column_wrapper inner_offsets{ + cudf::test::fixed_width_column_wrapper inner_offsets{ 0, 2, 5, 6, 9, 10, 12, 14, 17, 19}; auto inner_list = cudf::make_lists_column(9, inner_offsets.release(), values.release(), 0, {}); - cudf::test::fixed_width_column_wrapper outer_offsets{0, 2, 2, 3, 5, 7, 9}; + cudf::test::fixed_width_column_wrapper outer_offsets{0, 2, 2, 3, 5, 7, 9}; auto list = cudf::make_lists_column(6, outer_offsets.release(), std::move(inner_list), 0, {}); // expected size = (num rows at level 1 + num_rows at level 2) + # values in the leaf @@ -142,13 +142,13 @@ 
TYPED_TEST(RowBitCountTyped, ListsWithNulls) // } cudf::test::fixed_width_column_wrapper values{{1, 2, 3, 4, 5, 10, 6, 7, 8}, {1, 1, 1, 0, 1, 1, 0, 1, 0}}; - cudf::test::fixed_width_column_wrapper inner_offsets{0, 2, 5, 6, 9, 9}; + cudf::test::fixed_width_column_wrapper inner_offsets{0, 2, 5, 6, 9, 9}; std::vector inner_list_validity{1, 1, 1, 1, 0}; auto [null_mask, null_count] = cudf::test::detail::make_null_mask(inner_list_validity.begin(), inner_list_validity.end()); auto inner_list = cudf::make_lists_column( 5, inner_offsets.release(), values.release(), null_count, std::move(null_mask)); - cudf::test::fixed_width_column_wrapper outer_offsets{0, 2, 2, 3, 5}; + cudf::test::fixed_width_column_wrapper outer_offsets{0, 2, 2, 3, 5}; auto list = cudf::make_lists_column(4, outer_offsets.release(), std::move(inner_list), 0, {}); cudf::table_view t({*list}); @@ -177,7 +177,7 @@ TEST_F(RowBitCount, Strings) // expect 1 offset (4 bytes) + length of string per row auto size_iter = cudf::detail::make_counting_transform_iterator(0, [&strings](int i) { - return (static_cast(strings[i].size()) + sizeof(cudf::offset_type)) * CHAR_BIT; + return (static_cast(strings[i].size()) + sizeof(cudf::size_type)) * CHAR_BIT; }); cudf::test::fixed_width_column_wrapper expected(size_iter, size_iter + strings.size()); @@ -200,7 +200,7 @@ TEST_F(RowBitCount, StringsWithNulls) // expect 1 offset (4 bytes) + (length of string, or 0 if null) + 1 validity bit per row auto size_iter = cudf::detail::make_counting_transform_iterator(0, [&strings, &valids](int i) { return ((static_cast(valids[i] ? strings[i].size() : 0) + - sizeof(cudf::offset_type)) * + sizeof(cudf::size_type)) * CHAR_BIT) + 1; }); @@ -247,8 +247,8 @@ TEST_F(RowBitCount, StructsWithLists_RowsExceedingASingleBlock) cudf::make_numeric_column(cudf::data_type{cudf::type_id::INT32}, num_rows + 1); auto list_offsets_view = list_offsets->mutable_view(); thrust::tabulate(rmm::exec_policy(cudf::get_default_stream()), - list_offsets_view.begin(), - list_offsets_view.end(), + list_offsets_view.begin(), + list_offsets_view.end(), times_2{}); // List = {{0,1}, {2,3}, {4,5}, ..., {2*(num_rows-1), 2*num_rows-1}}; @@ -267,7 +267,7 @@ TEST_F(RowBitCount, StructsWithLists_RowsExceedingASingleBlock) thrust::fill_n(rmm::exec_policy(cudf::get_default_stream()), expected_row_bit_counts->mutable_view().begin(), num_rows, - CHAR_BIT * (2 * sizeof(int32_t) + sizeof(cudf::offset_type))); + CHAR_BIT * (2 * sizeof(int32_t) + sizeof(cudf::size_type))); CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(row_bit_counts->view(), expected_row_bit_counts->view()); } @@ -309,8 +309,7 @@ TEST_F(RowBitCount, StructsNoNulls) // expect 1 offset (4 bytes) + (length of string) + 1 float + 1 int16_t auto size_iter = cudf::detail::make_counting_transform_iterator(0, [&strings](int i) { return ((sizeof(float) + sizeof(int16_t)) * CHAR_BIT) + - ((static_cast(strings[i].size()) + sizeof(cudf::offset_type)) * - CHAR_BIT); + ((static_cast(strings[i].size()) + sizeof(cudf::size_type)) * CHAR_BIT); }); cudf::test::fixed_width_column_wrapper expected(size_iter, size_iter + t.num_rows()); @@ -534,7 +533,7 @@ TEST_F(RowBitCount, NestedTypes) TEST_F(RowBitCount, NullsInStringsList) { - using offsets_wrapper = cudf::test::fixed_width_column_wrapper; + using offsets_wrapper = cudf::test::fixed_width_column_wrapper; // clang-format off auto strings = std::vector{ "daïs", "def", "", "z", "bananas", "warp", "", "zing" }; @@ -552,7 +551,7 @@ TEST_F(RowBitCount, NullsInStringsList) {}); CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT( 
cudf::row_bit_count(cudf::table_view{{lists_col->view()}})->view(), - cudf::test::fixed_width_column_wrapper{138, 106, 130, 130}); + cudf::test::fixed_width_column_wrapper{138, 106, 130, 130}); } TEST_F(RowBitCount, EmptyChildColumnInListOfStrings) @@ -560,13 +559,13 @@ TEST_F(RowBitCount, EmptyChildColumnInListOfStrings) // Test with a list column with 4 empty list rows. // Note: Since there are no strings in any of the lists, // the lists column's child can be empty. - auto offsets = cudf::test::fixed_width_column_wrapper{0, 0, 0, 0, 0}; + auto offsets = cudf::test::fixed_width_column_wrapper{0, 0, 0, 0, 0}; auto lists_col = cudf::make_lists_column( 4, offsets.release(), cudf::make_empty_column(cudf::data_type{cudf::type_id::STRING}), 0, {}); CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT( cudf::row_bit_count(cudf::table_view{{lists_col->view()}})->view(), - cudf::test::fixed_width_column_wrapper{32, 32, 32, 32}); + cudf::test::fixed_width_column_wrapper{32, 32, 32, 32}); } TEST_F(RowBitCount, EmptyChildColumnInListOfLists) @@ -579,12 +578,12 @@ TEST_F(RowBitCount, EmptyChildColumnInListOfLists) return cudf::empty_like(exemplar); }; - auto offsets = cudf::test::fixed_width_column_wrapper{0, 0, 0, 0, 0}; + auto offsets = cudf::test::fixed_width_column_wrapper{0, 0, 0, 0, 0}; auto lists_col = cudf::make_lists_column(4, offsets.release(), empty_child_lists_column(), 0, {}); CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT( cudf::row_bit_count(cudf::table_view{{lists_col->view()}})->view(), - cudf::test::fixed_width_column_wrapper{32, 32, 32, 32}); + cudf::test::fixed_width_column_wrapper{32, 32, 32, 32}); } struct sum_functor { @@ -639,12 +638,12 @@ TEST_F(RowBitCount, DepthJump) // the jump occurs from depth 2 (the leafmost int column) // to depth 0 (the topmost int column) cudf::test::fixed_width_column_wrapper ____c0{1, 2, 3, 5, 5, 6, 7, 8}; - cudf::test::fixed_width_column_wrapper ___offsets{0, 2, 4, 6, 8}; + cudf::test::fixed_width_column_wrapper ___offsets{0, 2, 4, 6, 8}; auto ___c0 = cudf::make_lists_column(4, ___offsets.release(), ____c0.release(), 0, {}); std::vector> __children; __children.push_back(std::move(___c0)); cudf::test::structs_column_wrapper __c0(std::move(__children)); - cudf::test::fixed_width_column_wrapper _offsets{0, 3, 4}; + cudf::test::fixed_width_column_wrapper _offsets{0, 3, 4}; auto _c0 = cudf::make_lists_column(2, _offsets.release(), __c0.release(), 0, {}); cudf::test::fixed_width_column_wrapper _c1{3, 4}; std::vector> children; @@ -657,7 +656,7 @@ TEST_F(RowBitCount, DepthJump) // expected size = (num rows at level 1 + num_rows at level 2) + (# values the leaf int column) + // 1 (value in topmost int column) - constexpr cudf::size_type offset_size = sizeof(cudf::offset_type) * CHAR_BIT; + constexpr cudf::size_type offset_size = sizeof(cudf::size_type) * CHAR_BIT; constexpr cudf::size_type type_size = sizeof(T) * CHAR_BIT; cudf::test::fixed_width_column_wrapper expected{ ((1 + 3) * offset_size) + (6 * type_size) + (1 * type_size), @@ -693,7 +692,7 @@ TEST_F(RowBitCount, SlicedColumnsStrings) // expect 1 offset (4 bytes) + length of string per row auto size_iter = cudf::detail::make_counting_transform_iterator(0, [&strings](int i) { - return (static_cast(strings[i].size()) + sizeof(cudf::offset_type)) * CHAR_BIT; + return (static_cast(strings[i].size()) + sizeof(cudf::size_type)) * CHAR_BIT; }); cudf::test::fixed_width_column_wrapper expected(size_iter + 3, size_iter + 3 + slice_size); @@ -736,7 +735,7 @@ TEST_F(RowBitCount, SlicedColumnsStructs) // expect 1 offset (4 bytes) + length 
of string per row + 1 int16_t per row auto size_iter = cudf::detail::make_counting_transform_iterator(0, [&strings](int i) { - return (static_cast(strings[i].size()) + sizeof(cudf::offset_type) + + return (static_cast(strings[i].size()) + sizeof(cudf::size_type) + sizeof(int16_t)) * CHAR_BIT; }); diff --git a/cpp/tests/utilities/column_utilities.cu b/cpp/tests/utilities/column_utilities.cu index 3e0545935ef..fcaf23fd456 100644 --- a/cpp/tests/utilities/column_utilities.cu +++ b/cpp/tests/utilities/column_utilities.cu @@ -123,7 +123,7 @@ std::unique_ptr generate_child_row_indices(lists_column_view const& c, 0, [row_indices = row_indices.begin(), validity = c.null_mask(), - offsets = c.offsets().begin(), + offsets = c.offsets().begin(), offset = c.offset()] __device__(int index) { // both null mask and offsets data are not pre-sliced. so we need to add the column offset to // every incoming index. @@ -168,9 +168,9 @@ std::unique_ptr generate_child_row_indices(lists_column_view const& c, auto output_row_iter = cudf::detail::make_counting_transform_iterator( 0, [row_indices = row_indices.begin(), - offsets = c.offsets().begin(), + offsets = c.offsets().begin(), offset = c.offset(), - first_offset = cudf::detail::get_value( + first_offset = cudf::detail::get_value( c.offsets(), c.offset(), cudf::test::get_default_stream())] __device__(int index) { auto const true_index = row_indices[index] + offset; return offsets[true_index] - first_offset; diff --git a/cpp/tests/utilities/tdigest_utilities.cu b/cpp/tests/utilities/tdigest_utilities.cu index d2e95812894..9294aa0f681 100644 --- a/cpp/tests/utilities/tdigest_utilities.cu +++ b/cpp/tests/utilities/tdigest_utilities.cu @@ -110,12 +110,12 @@ std::unique_ptr make_expected_tdigest_column(std::vector h_offsets{0, tdigest.mean.size()}; + std::vector h_offsets{0, tdigest.mean.size()}; auto offsets = cudf::make_fixed_width_column(data_type{type_id::INT32}, 2, mask_state::UNALLOCATED); - CUDF_CUDA_TRY(cudaMemcpy(offsets->mutable_view().begin(), + CUDF_CUDA_TRY(cudaMemcpy(offsets->mutable_view().begin(), h_offsets.data(), - sizeof(offset_type) * 2, + sizeof(size_type) * 2, cudaMemcpyDefault)); auto list = cudf::make_lists_column(1, std::move(offsets), std::move(tdigests), 0, {}); diff --git a/cpp/tests/utilities_tests/column_utilities_tests.cpp b/cpp/tests/utilities_tests/column_utilities_tests.cpp index 6cdcdd22dd7..e90a3f9ac6e 100644 --- a/cpp/tests/utilities_tests/column_utilities_tests.cpp +++ b/cpp/tests/utilities_tests/column_utilities_tests.cpp @@ -385,7 +385,7 @@ TEST_F(ColumnUtilitiesListsTest, UnsanitaryLists) // 0, 1, 2 std::vector> children; children.emplace_back( - std::move(cudf::test::fixed_width_column_wrapper{0, 3}.release())); + std::move(cudf::test::fixed_width_column_wrapper{0, 3}.release())); children.emplace_back(std::move(cudf::test::fixed_width_column_wrapper{0, 1, 2}.release())); auto l0 = std::make_unique(cudf::data_type{cudf::type_id::LIST}, diff --git a/java/src/main/native/src/ColumnViewJni.cu b/java/src/main/native/src/ColumnViewJni.cu index 580f63d73b2..56aea0b45e2 100644 --- a/java/src/main/native/src/ColumnViewJni.cu +++ b/java/src/main/native/src/ColumnViewJni.cu @@ -208,10 +208,10 @@ std::unique_ptr lists_distinct_by_key(cudf::lists_column_view cons cudf::make_structs_column(out_labels.size(), std::move(out_structs_members), 0, {}); // Assemble a lists column of structs. 
- auto out_offsets = make_numeric_column(data_type{type_to_id()}, input.size() + 1, + auto out_offsets = make_numeric_column(data_type{type_to_id()}, input.size() + 1, mask_state::UNALLOCATED, stream); - auto const offsets_begin = out_offsets->mutable_view().template begin(); - auto const labels_begin = out_labels.template begin(); + auto const offsets_begin = out_offsets->mutable_view().template begin(); + auto const labels_begin = out_labels.template begin(); cudf::detail::labels_to_offsets(labels_begin, labels_begin + out_labels.size(), offsets_begin, offsets_begin + out_offsets->size(), stream); diff --git a/java/src/main/native/src/row_conversion.cu b/java/src/main/native/src/row_conversion.cu index a0dbfb3b38c..d93d38c7758 100644 --- a/java/src/main/native/src/row_conversion.cu +++ b/java/src/main/native/src/row_conversion.cu @@ -179,9 +179,9 @@ struct tile_info { * */ struct row_batch { - size_type num_bytes; // number of bytes in this batch - size_type row_count; // number of rows in the batch - device_uvector row_offsets; // offsets column of output cudf column + size_type num_bytes; // number of bytes in this batch + size_type row_count; // number of rows in the batch + device_uvector row_offsets; // offsets column of output cudf column }; /** diff --git a/python/cudf/cudf/_lib/cpp/types.pxd b/python/cudf/cudf/_lib/cpp/types.pxd index ee871f06231..11480d774ef 100644 --- a/python/cudf/cudf/_lib/cpp/types.pxd +++ b/python/cudf/cudf/_lib/cpp/types.pxd @@ -5,7 +5,6 @@ from libc.stdint cimport int32_t, uint32_t cdef extern from "cudf/types.hpp" namespace "cudf" nogil: ctypedef int32_t size_type - ctypedef int32_t offset_type ctypedef uint32_t bitmask_type ctypedef uint32_t char_utf8 diff --git a/python/cudf/cudf/_lib/pylibcudf/column.pxd b/python/cudf/cudf/_lib/pylibcudf/column.pxd index 740db51db6c..2b08e6863a1 100644 --- a/python/cudf/cudf/_lib/pylibcudf/column.pxd +++ b/python/cudf/cudf/_lib/pylibcudf/column.pxd @@ -5,7 +5,7 @@ from libcpp.vector cimport vector from cudf._lib.cpp.column.column cimport column from cudf._lib.cpp.column.column_view cimport column_view -from cudf._lib.cpp.types cimport bitmask_type, offset_type, size_type +from cudf._lib.cpp.types cimport bitmask_type, size_type from .gpumemoryview cimport gpumemoryview from .types cimport DataType @@ -20,7 +20,7 @@ cdef class Column: gpumemoryview data gpumemoryview mask size_type null_count - offset_type offset + size_type offset # children: List[Column] list children diff --git a/python/cudf/cudf/_lib/pylibcudf/column.pyx b/python/cudf/cudf/_lib/pylibcudf/column.pyx index 35c30b280c9..be4eff4c49d 100644 --- a/python/cudf/cudf/_lib/pylibcudf/column.pyx +++ b/python/cudf/cudf/_lib/pylibcudf/column.pyx @@ -6,7 +6,7 @@ from libcpp.utility cimport move from rmm._lib.device_buffer cimport DeviceBuffer from cudf._lib.cpp.column.column cimport column, column_contents -from cudf._lib.cpp.types cimport offset_type, size_type +from cudf._lib.cpp.types cimport size_type from .gpumemoryview cimport gpumemoryview from .types cimport DataType @@ -42,7 +42,7 @@ cdef class Column: """ def __init__( self, DataType data_type not None, size_type size, gpumemoryview data, - gpumemoryview mask, size_type null_count, offset_type offset, + gpumemoryview mask, size_type null_count, size_type offset, list children ): self.data_type = data_type From b7994bc16b1b1743b0743860b4f02ac4da8245d5 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Thu, 3 Aug 2023 07:54:33 -1000 Subject: [PATCH 025/230] 
Raise NotImplementedError for pd.SparseDtype (#13798) Currently cuDF seems to cast `pd.SparseDtype` to it's subtype instead of maintaining the sparse data type from pandas. Since `pd.SparseDtype` is not supported in cuDF, it is better to raise and tell users to cast directly to the sparse subtype Authors: - Matthew Roeschke (https://github.com/mroeschke) Approvers: - Bradley Dice (https://github.com/bdice) - GALI PREM SAGAR (https://github.com/galipremsagar) URL: https://github.com/rapidsai/cudf/pull/13798 --- python/cudf/cudf/core/column/column.py | 10 ++++++++++ python/cudf/cudf/tests/test_dataframe.py | 6 ++++++ python/cudf/cudf/tests/test_series.py | 6 ++++++ 3 files changed, 22 insertions(+) diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index b4f3f533d44..da3d04c15c0 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -2010,6 +2010,11 @@ def as_column( return as_column(arbitrary.array) elif PANDAS_GE_150 and isinstance(arbitrary.dtype, pd.ArrowDtype): return as_column(pa.array(arbitrary.array, from_pandas=True)) + elif isinstance(arbitrary.dtype, pd.SparseDtype): + raise NotImplementedError( + f"{arbitrary.dtype} is not supported. Convert first to " + f"{arbitrary.dtype.subtype}." + ) if is_categorical_dtype(arbitrary): data = as_column(pa.array(arbitrary, from_pandas=True)) elif is_interval_dtype(arbitrary.dtype): @@ -2214,6 +2219,11 @@ def as_column( ) if dtype is not None: data = data.astype(dtype) + elif isinstance(arbitrary, pd.arrays.SparseArray): + raise NotImplementedError( + f"{arbitrary.dtype} is not supported. Convert first to " + f"{arbitrary.dtype.subtype}." + ) elif isinstance(arbitrary, memoryview): data = as_column( np.asarray(arbitrary), dtype=dtype, nan_as_null=nan_as_null diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py index e35ab147bf4..0898cb2ef3d 100644 --- a/python/cudf/cudf/tests/test_dataframe.py +++ b/python/cudf/cudf/tests/test_dataframe.py @@ -10245,6 +10245,12 @@ def test_dataframe_init_columns_named_index(): assert_eq(gdf, pdf) +def test_dataframe_from_pandas_sparse(): + pdf = pd.DataFrame(range(2), dtype=pd.SparseDtype(np.int64, 0)) + with pytest.raises(NotImplementedError): + cudf.DataFrame(pdf) + + def test_dataframe_constructor_unbounded_sequence(): class A: def __getitem__(self, key): diff --git a/python/cudf/cudf/tests/test_series.py b/python/cudf/cudf/tests/test_series.py index 83d22bbca2d..58eaebae925 100644 --- a/python/cudf/cudf/tests/test_series.py +++ b/python/cudf/cudf/tests/test_series.py @@ -2206,6 +2206,12 @@ def test_series_contains(data, index): assert_eq(False in ps, False in gs) +def test_series_from_pandas_sparse(): + pser = pd.Series(range(2), dtype=pd.SparseDtype(np.int64, 0)) + with pytest.raises(NotImplementedError): + cudf.Series(pser) + + def test_series_constructor_unbounded_sequence(): class A: def __getitem__(self, key): From dcc8d9195d62c64f0c132d0b67f9abf17207305e Mon Sep 17 00:00:00 2001 From: Vukasin Milovanovic Date: Thu, 3 Aug 2023 13:43:22 -0700 Subject: [PATCH 026/230] Restructure JSON code to correctly reflect legacy/experimental status (#13757) Closes https://github.com/rapidsai/cudf/issues/11982 Moved files to correct directories. Updated namespaces: experimental -> base, base -> legacy Improved namespace nesting to reduce the need for fully qualified names (`json::detail` instead of `detail::json`). Use `host_span` instead of `std::vector &` in all `read_json` variants. 
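For illustration, a minimal caller-side sketch of the new `read_json` signature (not part of this change; the file path, stream, and memory resource below are placeholders). Because the parameter is now a `host_span`, an existing `std::vector` of datasources binds to it implicitly:

```cpp
#include <cudf/io/datasource.hpp>
#include <cudf/io/detail/json.hpp>
#include <cudf/io/json.hpp>
#include <cudf/utilities/default_stream.hpp>

#include <rmm/mr/device/per_device_resource.hpp>

#include <memory>
#include <vector>

void read_json_example()
{
  // Placeholder input path, used only for this sketch.
  std::vector<std::unique_ptr<cudf::io::datasource>> sources;
  sources.emplace_back(cudf::io::datasource::create("input.jsonl"));

  auto const opts =
    cudf::io::json_reader_options::builder(cudf::io::source_info{"input.jsonl"})
      .lines(true)
      .build();

  // The std::vector converts to host_span<std::unique_ptr<datasource>> at the call site.
  auto result = cudf::io::json::detail::read_json(
    sources, opts, cudf::get_default_stream(), rmm::mr::get_current_device_resource());
}
```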
Authors: - Vukasin Milovanovic (https://github.com/vuule) Approvers: - Vyas Ramasubramani (https://github.com/vyasr) - Nghia Truong (https://github.com/ttnghia) URL: https://github.com/rapidsai/cudf/pull/13757 --- cpp/CMakeLists.txt | 8 ++-- cpp/include/cudf/io/detail/data_casting.cuh | 4 +- cpp/include/cudf/io/detail/json.hpp | 2 +- .../{experimental => }/byte_range_info.cu | 6 +-- cpp/src/io/json/json_column.cu | 38 +++++++-------- cpp/src/io/json/{ => legacy}/json_gpu.cu | 17 ++----- cpp/src/io/json/{ => legacy}/json_gpu.hpp | 12 ++--- cpp/src/io/json/legacy/read_json.hpp | 33 +++++++++++++ cpp/src/io/json/{ => legacy}/reader_impl.cu | 26 ++++------ cpp/src/io/json/nested_json_gpu.cu | 16 +++---- .../io/json/{experimental => }/read_json.cu | 14 ++++-- .../io/json/{experimental => }/read_json.hpp | 6 +-- cpp/src/io/json/write_json.cu | 2 +- cpp/tests/io/json_chunked_reader.cpp | 4 +- cpp/tests/io/json_type_cast_test.cu | 48 +++++++++---------- 15 files changed, 125 insertions(+), 111 deletions(-) rename cpp/src/io/json/{experimental => }/byte_range_info.cu (89%) rename cpp/src/io/json/{ => legacy}/json_gpu.cu (98%) rename cpp/src/io/json/{ => legacy}/json_gpu.hpp (95%) create mode 100644 cpp/src/io/json/legacy/read_json.hpp rename cpp/src/io/json/{ => legacy}/reader_impl.cu (96%) rename cpp/src/io/json/{experimental => }/read_json.cu (96%) rename cpp/src/io/json/{experimental => }/read_json.hpp (91%) diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 37eef74f6ed..d6b2fb10c23 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -370,13 +370,13 @@ add_library( src/io/csv/reader_impl.cu src/io/csv/writer_impl.cu src/io/functions.cpp + src/io/json/byte_range_info.cu src/io/json/json_column.cu - src/io/json/json_gpu.cu src/io/json/json_tree.cu src/io/json/nested_json_gpu.cu - src/io/json/reader_impl.cu - src/io/json/experimental/byte_range_info.cu - src/io/json/experimental/read_json.cu + src/io/json/read_json.cu + src/io/json/legacy/json_gpu.cu + src/io/json/legacy/reader_impl.cu src/io/json/write_json.cu src/io/orc/aggregate_orc_metadata.cpp src/io/orc/dict_enc.cu diff --git a/cpp/include/cudf/io/detail/data_casting.cuh b/cpp/include/cudf/io/detail/data_casting.cuh index d764e8533c6..b7ee5e05e96 100644 --- a/cpp/include/cudf/io/detail/data_casting.cuh +++ b/cpp/include/cudf/io/detail/data_casting.cuh @@ -32,7 +32,7 @@ #include -namespace cudf::io::json::experimental::detail { +namespace cudf::io::json::detail { // Unicode code point escape sequence static constexpr char UNICODE_SEQ = 0x7F; @@ -428,4 +428,4 @@ std::unique_ptr parse_data(str_tuple_it str_tuples, return out_col; } -} // namespace cudf::io::json::experimental::detail +} // namespace cudf::io::json::detail diff --git a/cpp/include/cudf/io/detail/json.hpp b/cpp/include/cudf/io/detail/json.hpp index 7b0350e9bc8..6930a4fdb25 100644 --- a/cpp/include/cudf/io/detail/json.hpp +++ b/cpp/include/cudf/io/detail/json.hpp @@ -33,7 +33,7 @@ namespace cudf::io::json::detail { * * @return cudf::table object that contains the array of cudf::column. 
*/ -table_with_metadata read_json(std::vector>& sources, +table_with_metadata read_json(host_span> sources, json_reader_options const& options, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr); diff --git a/cpp/src/io/json/experimental/byte_range_info.cu b/cpp/src/io/json/byte_range_info.cu similarity index 89% rename from cpp/src/io/json/experimental/byte_range_info.cu rename to cpp/src/io/json/byte_range_info.cu index d6e30d090a5..d359e917dfa 100644 --- a/cpp/src/io/json/experimental/byte_range_info.cu +++ b/cpp/src/io/json/byte_range_info.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022, NVIDIA CORPORATION. + * Copyright (c) 2022-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -21,7 +21,7 @@ #include #include -namespace cudf::io::detail::json::experimental { +namespace cudf::io::json::detail { // Extract the first character position in the string. size_type find_first_delimiter(device_span d_data, @@ -33,4 +33,4 @@ size_type find_first_delimiter(device_span d_data, return first_delimiter_position != d_data.end() ? first_delimiter_position - d_data.begin() : -1; } -} // namespace cudf::io::detail::json::experimental +} // namespace cudf::io::json::detail diff --git a/cpp/src/io/json/json_column.cu b/cpp/src/io/json/json_column.cu index b18637c86d7..0cd8edaf78c 100644 --- a/cpp/src/io/json/json_column.cu +++ b/cpp/src/io/json/json_column.cu @@ -52,8 +52,7 @@ #include #include -namespace cudf::io::json { -namespace detail { +namespace cudf::io::json::detail { // DEBUG prints auto to_cat = [](auto v) -> std::string { @@ -348,14 +347,14 @@ std::vector copy_strings_to_host(device_span input, cudf::io::parse_options_view options_view{}; options_view.quotechar = '\0'; // no quotes options_view.keepquotes = true; - auto d_column_names = experimental::detail::parse_data(string_views.begin(), - num_strings, - data_type{type_id::STRING}, - rmm::device_buffer{}, - 0, - options_view, - stream, - rmm::mr::get_current_device_resource()); + auto d_column_names = parse_data(string_views.begin(), + num_strings, + data_type{type_id::STRING}, + rmm::device_buffer{}, + 0, + options_view, + stream, + rmm::mr::get_current_device_resource()); auto to_host = [](auto const& col) { if (col.is_empty()) return std::vector{}; auto const scv = cudf::strings_column_view(col); @@ -796,14 +795,14 @@ std::pair, std::vector> device_json_co auto [result_bitmask, null_count] = make_validity(json_col); // Convert strings to the inferred data type - auto col = experimental::detail::parse_data(string_spans_it, - col_size, - target_type, - std::move(result_bitmask), - null_count, - options.view(), - stream, - mr); + auto col = parse_data(string_spans_it, + col_size, + target_type, + std::move(result_bitmask), + null_count, + options.view(), + stream, + mr); // Reset nullable if we do not have nulls // This is to match the existing JSON reader's behaviour: @@ -1044,5 +1043,4 @@ table_with_metadata device_parse_nested_json(device_span d_input, return table_with_metadata{std::make_unique
(std::move(out_columns)), {out_column_names}}; } -} // namespace detail -} // namespace cudf::io::json +} // namespace cudf::io::json::detail diff --git a/cpp/src/io/json/json_gpu.cu b/cpp/src/io/json/legacy/json_gpu.cu similarity index 98% rename from cpp/src/io/json/json_gpu.cu rename to cpp/src/io/json/legacy/json_gpu.cu index 167ae332ac7..d28d5614591 100644 --- a/cpp/src/io/json/json_gpu.cu +++ b/cpp/src/io/json/legacy/json_gpu.cu @@ -45,11 +45,7 @@ using cudf::device_span; -namespace cudf { -namespace io { -namespace json { -namespace gpu { -using namespace ::cudf; +namespace cudf::io::json::detail::legacy { namespace { /** @@ -515,7 +511,7 @@ __global__ void collect_keys_info_kernel(parse_options_view const options, } // namespace /** - * @copydoc cudf::io::json::gpu::convert_json_to_columns + * @copydoc cudf::io::json::detail::legacy::convert_json_to_columns */ void convert_json_to_columns(parse_options_view const& opts, device_span const data, @@ -547,7 +543,7 @@ void convert_json_to_columns(parse_options_view const& opts, } /** - * @copydoc cudf::io::gpu::detect_data_types + * @copydoc cudf::io::json::detail::legacy::detect_data_types */ std::vector detect_data_types( @@ -592,7 +588,7 @@ std::vector detect_data_types( } /** - * @copydoc cudf::io::json::gpu::gpu_collect_keys_info + * @copydoc cudf::io::json::detail::legacy::collect_keys_info */ void collect_keys_info(parse_options_view const& options, device_span const data, @@ -615,7 +611,4 @@ void collect_keys_info(parse_options_view const& options, CUDF_CHECK_CUDA(stream.value()); } -} // namespace gpu -} // namespace json -} // namespace io -} // namespace cudf +} // namespace cudf::io::json::detail::legacy diff --git a/cpp/src/io/json/json_gpu.hpp b/cpp/src/io/json/legacy/json_gpu.hpp similarity index 95% rename from cpp/src/io/json/json_gpu.hpp rename to cpp/src/io/json/legacy/json_gpu.hpp index 46bc2dd95a3..48fe6c69390 100644 --- a/cpp/src/io/json/json_gpu.hpp +++ b/cpp/src/io/json/legacy/json_gpu.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2022, NVIDIA CORPORATION. + * Copyright (c) 2020-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -31,10 +31,7 @@ using cudf::device_span; -namespace cudf { -namespace io { -namespace json { -namespace gpu { +namespace cudf::io::json::detail::legacy { using col_map_type = concurrent_unordered_map; /** @@ -100,7 +97,4 @@ void collect_keys_info(parse_options_view const& options, thrust::optional keys_info, rmm::cuda_stream_view stream); -} // namespace gpu -} // namespace json -} // namespace io -} // namespace cudf +} // namespace cudf::io::json::detail::legacy diff --git a/cpp/src/io/json/legacy/read_json.hpp b/cpp/src/io/json/legacy/read_json.hpp new file mode 100644 index 00000000000..e3fa010e08e --- /dev/null +++ b/cpp/src/io/json/legacy/read_json.hpp @@ -0,0 +1,33 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include + +#include + +#include + +#include +#include + +namespace cudf::io::json::detail::legacy { + +table_with_metadata read_json(host_span> sources, + json_reader_options const& reader_opts, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); + +} // namespace cudf::io::json::detail::legacy diff --git a/cpp/src/io/json/reader_impl.cu b/cpp/src/io/json/legacy/reader_impl.cu similarity index 96% rename from cpp/src/io/json/reader_impl.cu rename to cpp/src/io/json/legacy/reader_impl.cu index c7b46813909..c524c041df7 100644 --- a/cpp/src/io/json/reader_impl.cu +++ b/cpp/src/io/json/legacy/reader_impl.cu @@ -16,8 +16,6 @@ #include "json_gpu.hpp" -#include "experimental/read_json.hpp" - #include #include @@ -56,9 +54,8 @@ using cudf::host_span; -namespace cudf::io::json::detail { +namespace cudf::io::json::detail::legacy { -using col_map_type = cudf::io::json::gpu::col_map_type; using col_map_ptr_type = std::unique_ptr>; /** @@ -129,8 +126,7 @@ std::unique_ptr
create_json_keys_info_table(parse_options_view const& par { // Count keys rmm::device_scalar key_counter(0, stream); - cudf::io::json::gpu::collect_keys_info( - parse_opts, data, row_offsets, key_counter.data(), {}, stream); + collect_keys_info(parse_opts, data, row_offsets, key_counter.data(), {}, stream); // Allocate columns to store hash value, length, and offset of each JSON object key in the input auto const num_keys = key_counter.value(stream); @@ -148,8 +144,7 @@ std::unique_ptr
create_json_keys_info_table(parse_options_view const& par // Reset the key counter - now used for indexing key_counter.set_value_to_zero_async(stream); // Fill the allocated columns - cudf::io::json::gpu::collect_keys_info( - parse_opts, data, row_offsets, key_counter.data(), {*info_table_mdv}, stream); + collect_keys_info(parse_opts, data, row_offsets, key_counter.data(), {*info_table_mdv}, stream); return info_table; } @@ -213,7 +208,7 @@ std::pair, col_map_ptr_type> get_json_object_keys_hashe create_col_names_hash_map(sorted_info->get_column(2).view(), stream)}; } -std::vector ingest_raw_input(std::vector> const& sources, +std::vector ingest_raw_input(host_span> sources, compression_type compression, size_t range_offset, size_t range_size, @@ -447,7 +442,7 @@ std::vector get_data_types(json_reader_options const& reader_opts, auto const num_columns = column_names.size(); auto const do_set_null_count = column_map->capacity() > 0; - auto const h_column_infos = cudf::io::json::gpu::detect_data_types( + auto const h_column_infos = detect_data_types( parse_opts, data, rec_starts, do_set_null_count, num_columns, column_map, stream); auto get_type_id = [&](auto const& cinfo) { @@ -523,7 +518,7 @@ table_with_metadata convert_data_to_table(parse_options_view const& parse_opts, auto d_valid_counts = cudf::detail::make_zeroed_device_uvector_async( num_columns, stream, rmm::mr::get_current_device_resource()); - cudf::io::json::gpu::convert_json_to_columns( + convert_json_to_columns( parse_opts, data, rec_starts, d_dtypes, column_map, d_data, d_valid, d_valid_counts, stream); stream.synchronize(); @@ -591,16 +586,11 @@ table_with_metadata convert_data_to_table(parse_options_view const& parse_opts, * * @return Table and its metadata */ -table_with_metadata read_json(std::vector>& sources, +table_with_metadata read_json(host_span> sources, json_reader_options const& reader_opts, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { - CUDF_FUNC_RANGE(); - if (not reader_opts.is_enabled_legacy()) { - return cudf::io::detail::json::experimental::read_json(sources, reader_opts, stream, mr); - } - CUDF_EXPECTS(not sources.empty(), "No sources were defined"); CUDF_EXPECTS(sources.size() == 1 or reader_opts.get_compression() == compression_type::NONE, "Multiple compressed inputs are not supported"); @@ -664,4 +654,4 @@ table_with_metadata read_json(std::vector>& sources, mr); } -} // namespace cudf::io::json::detail +} // namespace cudf::io::json::detail::legacy diff --git a/cpp/src/io/json/nested_json_gpu.cu b/cpp/src/io/json/nested_json_gpu.cu index 3b6c2b18250..0629ceb95c6 100644 --- a/cpp/src/io/json/nested_json_gpu.cu +++ b/cpp/src/io/json/nested_json_gpu.cu @@ -1993,14 +1993,14 @@ std::pair, std::vector> json_column_to auto [result_bitmask, null_count] = make_validity(json_col); // Convert strings to the inferred data type - auto col = experimental::detail::parse_data(string_spans_it, - col_size, - target_type, - std::move(result_bitmask), - null_count, - parsing_options(options).view(), - stream, - mr); + auto col = parse_data(string_spans_it, + col_size, + target_type, + std::move(result_bitmask), + null_count, + parsing_options(options).view(), + stream, + mr); // Reset nullable if we do not have nulls // This is to match the existing JSON reader's behaviour: diff --git a/cpp/src/io/json/experimental/read_json.cu b/cpp/src/io/json/read_json.cu similarity index 96% rename from cpp/src/io/json/experimental/read_json.cu rename to cpp/src/io/json/read_json.cu index dbb4a628c44..080da7800f4 
100644 --- a/cpp/src/io/json/experimental/read_json.cu +++ b/cpp/src/io/json/read_json.cu @@ -17,6 +17,7 @@ #include "read_json.hpp" #include +#include #include #include @@ -30,7 +31,7 @@ #include -namespace cudf::io::detail::json::experimental { +namespace cudf::io::json::detail { size_t sources_size(host_span> const sources, size_t range_offset, @@ -44,7 +45,7 @@ size_t sources_size(host_span> const sources, }); } -rmm::device_uvector ingest_raw_input(host_span> const& sources, +rmm::device_uvector ingest_raw_input(host_span> sources, compression_type compression, size_t range_offset, size_t range_size, @@ -197,6 +198,11 @@ table_with_metadata read_json(host_span> sources, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); + + if (reader_opts.is_enabled_legacy()) { + return legacy::read_json(sources, reader_opts, stream, mr); + } + if (not should_load_whole_source(reader_opts)) { CUDF_EXPECTS(reader_opts.is_enabled_lines(), "Specifying a byte range is supported only for JSON Lines"); @@ -213,8 +219,8 @@ table_with_metadata read_json(host_span> sources, auto const buffer = get_record_range_raw_input(sources, reader_opts, stream); - return cudf::io::json::detail::device_parse_nested_json(buffer, reader_opts, stream, mr); + return device_parse_nested_json(buffer, reader_opts, stream, mr); // For debug purposes, use host_parse_nested_json() } -} // namespace cudf::io::detail::json::experimental +} // namespace cudf::io::json::detail diff --git a/cpp/src/io/json/experimental/read_json.hpp b/cpp/src/io/json/read_json.hpp similarity index 91% rename from cpp/src/io/json/experimental/read_json.hpp rename to cpp/src/io/json/read_json.hpp index 48e104c4254..db37e7abcdb 100644 --- a/cpp/src/io/json/experimental/read_json.hpp +++ b/cpp/src/io/json/read_json.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022, NVIDIA CORPORATION. + * Copyright (c) 2022-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -26,7 +26,7 @@ #include -namespace cudf::io::detail::json::experimental { +namespace cudf::io::json::detail { table_with_metadata read_json(host_span> sources, json_reader_options const& reader_opts, @@ -42,4 +42,4 @@ size_type find_first_delimiter_in_chunk(host_span #include -#include +#include /** * @brief Base test fixture for JSON reader tests @@ -37,7 +37,7 @@ std::vector skeleton_for_parellel_chunk_reader( rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { - using namespace cudf::io::detail::json::experimental; + using namespace cudf::io::json::detail; using cudf::size_type; // assuming single source. 
size_t total_source_size = 0; diff --git a/cpp/tests/io/json_type_cast_test.cu b/cpp/tests/io/json_type_cast_test.cu index a7710036125..5c32131114d 100644 --- a/cpp/tests/io/json_type_cast_test.cu +++ b/cpp/tests/io/json_type_cast_test.cu @@ -79,14 +79,14 @@ TEST_F(JSONTypeCastTest, String) auto null_mask = std::get<0>(cudf::test::detail::make_null_mask(null_mask_it, null_mask_it + d_column->size())); - auto str_col = cudf::io::json::experimental::detail::parse_data(svs.data(), - svs.size(), - type, - std::move(null_mask), - 0, - default_json_options().view(), - stream, - mr); + auto str_col = cudf::io::json::detail::parse_data(svs.data(), + svs.size(), + type, + std::move(null_mask), + 0, + default_json_options().view(), + stream, + mr); auto out_valids = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i != 2 and i != 4; }); @@ -115,14 +115,14 @@ TEST_F(JSONTypeCastTest, Int) auto null_mask = std::get<0>(cudf::test::detail::make_null_mask(null_mask_it, null_mask_it + d_column->size())); - auto col = cudf::io::json::experimental::detail::parse_data(svs.data(), - svs.size(), - type, - std::move(null_mask), - 0, - default_json_options().view(), - stream, - mr); + auto col = cudf::io::json::detail::parse_data(svs.data(), + svs.size(), + type, + std::move(null_mask), + 0, + default_json_options().view(), + stream, + mr); auto expected = cudf::test::fixed_width_column_wrapper{{1, 2, 3, 1, 5, 0}, {1, 0, 1, 1, 1, 1}}; @@ -158,14 +158,14 @@ TEST_F(JSONTypeCastTest, StringEscapes) auto null_mask = std::get<0>(cudf::test::detail::make_null_mask(null_mask_it, null_mask_it + d_column->size())); - auto col = cudf::io::json::experimental::detail::parse_data(svs.data(), - svs.size(), - type, - std::move(null_mask), - 0, - default_json_options().view(), - stream, - mr); + auto col = cudf::io::json::detail::parse_data(svs.data(), + svs.size(), + type, + std::move(null_mask), + 0, + default_json_options().view(), + stream, + mr); auto expected = cudf::test::strings_column_wrapper{ {"🚀", "A🚀AA", "", "", "", "\\", "➩", "", "\"\\/\b\f\n\r\t"}, From 15cc5011902f1026e04662e725b880f48d38ba8d Mon Sep 17 00:00:00 2001 From: Divye Gala Date: Thu, 3 Aug 2023 19:41:57 -0400 Subject: [PATCH 027/230] Update `lists::contains` to experimental row comparator (#13810) Part of #11844 Authors: - Divye Gala (https://github.com/divyegala) - Nghia Truong (https://github.com/ttnghia) Approvers: - Bradley Dice (https://github.com/bdice) - Nghia Truong (https://github.com/ttnghia) - Yunsong Wang (https://github.com/PointKernel) URL: https://github.com/rapidsai/cudf/pull/13810 --- cpp/src/lists/contains.cu | 229 +++++++++++++------------------------- 1 file changed, 75 insertions(+), 154 deletions(-) diff --git a/cpp/src/lists/contains.cu b/cpp/src/lists/contains.cu index 9d39f2f9a90..df1d043bdb6 100644 --- a/cpp/src/lists/contains.cu +++ b/cpp/src/lists/contains.cu @@ -71,11 +71,13 @@ static auto constexpr is_supported_non_nested_type() /** * @brief Check if the given type is supported in `cudf::lists::contains`. 
*/ -template -auto constexpr is_supported_type() -{ - return is_supported_non_nested_type() || cudf::is_nested(); -} +struct is_supported_type_fn { + template + auto constexpr operator()() + { + return is_supported_non_nested_type() || cudf::is_nested(); + } +}; /** * @brief Return a pair of index iterators {begin, end} to loop through elements within a @@ -105,60 +107,19 @@ __device__ auto element_index_pair_iter(size_type const size) } } -/** - * @brief Functor to perform searching for index of a key element in a given list, specialized - * for non-nested types. - */ -struct search_list_non_nested_types_fn { - duplicate_find_option const find_option; - - template ())> - __device__ size_type operator()(list_device_view const list, - thrust::optional const key_opt) const - { - // A null list or null key will result in a null output row. - if (list.is_null() || !key_opt) { return NULL_SENTINEL; } - - return find_option == duplicate_find_option::FIND_FIRST - ? search_list(list, *key_opt) - : search_list(list, *key_opt); - } - - template ())> - __device__ size_type operator()(list_device_view const, thrust::optional const) const - { - CUDF_UNREACHABLE("Unsupported type."); - } - - private: - template ())> - static __device__ inline size_type search_list(list_device_view const list, - Element const search_key) - { - auto const [begin, end] = element_index_pair_iter(list.size()); - auto const found_iter = - thrust::find_if(thrust::seq, begin, end, [=] __device__(auto const idx) { - return !list.is_null(idx) && - cudf::equality_compare(list.template element(idx), search_key); - }); - // If the key is found, return its found position in the list from `found_iter`. - return found_iter == end ? NOT_FOUND_SENTINEL : *found_iter; - } -}; - /** * @brief Functor to perform searching for index of a key element in a given list, specialized * for nested types. */ template -struct search_list_nested_types_fn { +struct search_list_fn { duplicate_find_option const find_option; KeyValidityIter const key_validity_iter; EqComparator const d_comp; - search_list_nested_types_fn(duplicate_find_option const find_option, - KeyValidityIter const key_validity_iter, - EqComparator const& d_comp) + search_list_fn(duplicate_find_option const find_option, + KeyValidityIter const key_validity_iter, + EqComparator const& d_comp) : find_option(find_option), key_validity_iter(key_validity_iter), d_comp(d_comp) { } @@ -168,13 +129,13 @@ struct search_list_nested_types_fn { // A null list or null key will result in a null output row. if (list.is_null() || !key_validity_iter[list.row_index()]) { return NULL_SENTINEL; } - return find_option == duplicate_find_option::FIND_FIRST ? search_list(list) - : search_list(list); + return find_option == duplicate_find_option::FIND_FIRST ? search_list_op(list) + : search_list_op(list); } private: template - __device__ inline size_type search_list(list_device_view const list) const + __device__ inline size_type search_list_op(list_device_view const list) const { using cudf::experimental::row::lhs_index_type; using cudf::experimental::row::rhs_index_type; @@ -190,129 +151,90 @@ struct search_list_nested_types_fn { } }; -/** - * @brief Function to search for key element(s) in the corresponding rows of a lists column, - * specialized for non-nested types. 
- */ -template -void index_of_non_nested_types(InputIterator input_it, - size_type num_rows, - OutputIterator output_it, - column_view const& search_keys, - bool search_keys_have_nulls, - duplicate_find_option find_option, - rmm::cuda_stream_view stream) -{ - auto const keys_cdv_ptr = column_device_view::create(search_keys, stream); - auto const keys_iter = cudf::detail::make_optional_iterator( - *keys_cdv_ptr, nullate::DYNAMIC{search_keys_have_nulls}); - thrust::transform(rmm::exec_policy(stream), - input_it, - input_it + num_rows, - keys_iter, - output_it, - search_list_non_nested_types_fn{find_option}); -} - /** * @brief Function to search for index of key element(s) in the corresponding rows of a lists * column, specialized for nested types. */ -template -void index_of_nested_types(InputIterator input_it, - size_type num_rows, - OutputIterator output_it, - column_view const& child, - column_view const& search_keys, - duplicate_find_option find_option, - rmm::cuda_stream_view stream) +template +void index_of(InputIterator input_it, + size_type num_rows, + OutputIterator output_it, + column_view const& child, + column_view const& search_keys, + duplicate_find_option find_option, + DeviceComp d_comp, + rmm::cuda_stream_view stream) { - auto const keys_tview = cudf::table_view{{search_keys}}; - auto const child_tview = table_view{{child}}; - auto const has_nulls = has_nested_nulls(child_tview) || has_nested_nulls(keys_tview); - auto const comparator = - cudf::experimental::row::equality::two_table_comparator(child_tview, keys_tview, stream); - auto const d_comp = comparator.equal_to(nullate::DYNAMIC{has_nulls}); - auto const keys_dv_ptr = column_device_view::create(search_keys, stream); auto const key_validity_iter = cudf::detail::make_validity_iterator(*keys_dv_ptr); thrust::transform(rmm::exec_policy(stream), input_it, input_it + num_rows, output_it, - search_list_nested_types_fn{find_option, key_validity_iter, d_comp}); + search_list_fn{find_option, key_validity_iter, d_comp}); } /** - * @brief Dispatch functor to search for index of key element(s) in the corresponding rows of a + * @brief Dispatch function to search for index of key element(s) in the corresponding rows of a * lists column. */ -struct dispatch_index_of { - // SFINAE with conditional return type because we need to support device lambda in this function. - // This is required due to a limitation of nvcc. - template - std::enable_if_t(), std::unique_ptr> operator()( - lists_column_view const& lists, - column_view const& search_keys, - duplicate_find_option find_option, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) const - { - // Access the child column through `child()` method, not `get_sliced_child()`. - // This is because slicing offset has already been taken into account during row - // comparisons. - auto const child = lists.child(); +std::unique_ptr dispatch_index_of(lists_column_view const& lists, + column_view const& search_keys, + duplicate_find_option find_option, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + CUDF_EXPECTS(cudf::type_dispatcher(search_keys.type(), is_supported_type_fn{}), + "Unsupported type in `dispatch_index_of` function."); + // Access the child column through `child()` method, not `get_sliced_child()`. + // This is because slicing offset has already been taken into account during row + // comparisons. 
+ auto const child = lists.child(); - CUDF_EXPECTS(child.type() == search_keys.type(), - "Type/Scale of search key does not match list column element type.", - cudf::data_type_error); - CUDF_EXPECTS(search_keys.type().id() != type_id::EMPTY, "Type cannot be empty."); + CUDF_EXPECTS(child.type() == search_keys.type(), + "Type/Scale of search key does not match list column element type.", + cudf::data_type_error); + CUDF_EXPECTS(search_keys.type().id() != type_id::EMPTY, "Type cannot be empty."); - auto const search_keys_have_nulls = search_keys.has_nulls(); + auto const search_keys_have_nulls = search_keys.has_nulls(); - auto const num_rows = lists.size(); + auto const num_rows = lists.size(); - auto const lists_cdv_ptr = column_device_view::create(lists.parent(), stream); - auto const input_it = cudf::detail::make_counting_transform_iterator( - size_type{0}, - [lists = cudf::detail::lists_column_device_view{*lists_cdv_ptr}] __device__(auto const idx) { - return list_device_view{lists, idx}; - }); + auto const lists_cdv_ptr = column_device_view::create(lists.parent(), stream); + auto const input_it = cudf::detail::make_counting_transform_iterator( + size_type{0}, + [lists = cudf::detail::lists_column_device_view{*lists_cdv_ptr}] __device__(auto const idx) { + return list_device_view{lists, idx}; + }); + + auto out_positions = make_numeric_column( + data_type{type_to_id()}, num_rows, cudf::mask_state::UNALLOCATED, stream, mr); + auto const output_it = out_positions->mutable_view().template begin(); - auto out_positions = make_numeric_column( - data_type{type_to_id()}, num_rows, cudf::mask_state::UNALLOCATED, stream, mr); - auto const output_it = out_positions->mutable_view().template begin(); - - if constexpr (not cudf::is_nested()) { - index_of_non_nested_types( - input_it, num_rows, output_it, search_keys, search_keys_have_nulls, find_option, stream); - } else { // list + struct - index_of_nested_types(input_it, num_rows, output_it, child, search_keys, find_option, stream); - } - - if (search_keys_have_nulls || lists.has_nulls()) { - auto [null_mask, null_count] = cudf::detail::valid_if( - output_it, - output_it + num_rows, - [] __device__(auto const idx) { return idx != NULL_SENTINEL; }, - stream, - mr); - out_positions->set_null_mask(std::move(null_mask), null_count); - } - return out_positions; + auto const keys_tview = cudf::table_view{{search_keys}}; + auto const child_tview = cudf::table_view{{child}}; + auto const has_nulls = has_nested_nulls(child_tview) || has_nested_nulls(keys_tview); + auto const comparator = + cudf::experimental::row::equality::two_table_comparator(child_tview, keys_tview, stream); + if (cudf::is_nested(search_keys.type())) { + auto const d_comp = comparator.equal_to(nullate::DYNAMIC{has_nulls}); + index_of(input_it, num_rows, output_it, child, search_keys, find_option, d_comp, stream); + } else { + auto const d_comp = comparator.equal_to(nullate::DYNAMIC{has_nulls}); + index_of(input_it, num_rows, output_it, child, search_keys, find_option, d_comp, stream); } - template - std::enable_if_t(), std::unique_ptr> operator()( - lists_column_view const&, - SearchKeyType const&, - duplicate_find_option, - rmm::cuda_stream_view, - rmm::mr::device_memory_resource*) const - { - CUDF_FAIL("Unsupported type in `dispatch_index_of` functor."); + if (search_keys_have_nulls || lists.has_nulls()) { + auto [null_mask, null_count] = cudf::detail::valid_if( + output_it, + output_it + num_rows, + [] __device__(auto const idx) { return idx != NULL_SENTINEL; }, + stream, + mr); + 
out_positions->set_null_mask(std::move(null_mask), null_count); } -}; + return out_positions; +} /** * @brief Converts key-positions vector (from `index_of()`) to a BOOL8 vector, indicating if @@ -376,8 +298,7 @@ std::unique_ptr index_of(lists_column_view const& lists, { CUDF_EXPECTS(search_keys.size() == lists.size(), "Number of search keys must match list column size."); - return cudf::type_dispatcher( - search_keys.type(), dispatch_index_of{}, lists, search_keys, find_option, stream, mr); + return dispatch_index_of(lists, search_keys, find_option, stream, mr); } std::unique_ptr contains(lists_column_view const& lists, From 073bf83b3b8e74826a036fafb2837afe9d4e09f3 Mon Sep 17 00:00:00 2001 From: GALI PREM SAGAR Date: Fri, 4 Aug 2023 10:02:18 -0500 Subject: [PATCH 028/230] Raise error when mixed types are being constructed (#13816) This PR raises error when a mixed type data is being constructed instead of type-casting `nan` values to string nans (`'nan'`) Authors: - GALI PREM SAGAR (https://github.com/galipremsagar) Approvers: - Bradley Dice (https://github.com/bdice) URL: https://github.com/rapidsai/cudf/pull/13816 --- python/cudf/cudf/core/column/column.py | 6 ++---- python/cudf/cudf/tests/test_series.py | 5 +++++ 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index da3d04c15c0..57f6c80fb05 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -2161,9 +2161,7 @@ def as_column( if dtype is not None: data = data.astype(dtype) elif arb_dtype.kind in ("O", "U"): - data = as_column( - pa.Array.from_pandas(arbitrary), dtype=arbitrary.dtype - ) + data = as_column(pa.array(arbitrary), dtype=arbitrary.dtype) # There is no cast operation available for pa.Array from int to # str, Hence instead of handling in pa.Array block, we # will have to type-cast here. @@ -2422,7 +2420,7 @@ def _construct_array( if ( dtype is None and not cudf._lib.scalar._is_null_host_scalar(arbitrary) - and infer_dtype(arbitrary) + and infer_dtype(arbitrary, skipna=False) in ( "mixed", "mixed-integer", diff --git a/python/cudf/cudf/tests/test_series.py b/python/cudf/cudf/tests/test_series.py index 58eaebae925..6b009d7e913 100644 --- a/python/cudf/cudf/tests/test_series.py +++ b/python/cudf/cudf/tests/test_series.py @@ -2219,3 +2219,8 @@ def __getitem__(self, key): with pytest.raises(TypeError): cudf.Series(A()) + + +def test_series_constructor_error_mixed_type(): + with pytest.raises(pa.ArrowTypeError): + cudf.Series(["abc", np.nan, "123"], nan_as_null=False) From d8bf9d2534d2e0b452ee4886fd8b9bd30d674dee Mon Sep 17 00:00:00 2001 From: Vukasin Milovanovic Date: Fri, 4 Aug 2023 09:54:34 -0700 Subject: [PATCH 029/230] Avoid use of CUDF_EXPECTS in libcudf unit tests outside of helper functions with return values (#13812) `CUDF_EXPECTS` should be used to report internal errors in libcudf. Google test framework has macros that should be used to report failures. The `ASSERT_` macros replace `CUDF_EXPECTS` as they exit the current function (not by throwing as `CUDF_EXPECTS`, but the effect is very similar). This PR replaces the erroneous use of CUDF_EXPECTS with ASSERT_XYZ macros in unit tests. 
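As an illustration of the replacement pattern, a hypothetical, self-contained test (not code from this PR):

```cpp
#include <gtest/gtest.h>

#include <vector>

TEST(AssertVsExpectsExample, SizesMatch)
{
  std::vector<int> const result{1, 2, 3};
  std::vector<int> const expected{1, 2, 3};

  // Previously such checks were written as
  //   CUDF_EXPECTS(result.size() == expected.size(), "size mismatch");
  // which throws and is intended for reporting internal libcudf errors.
  // The gtest macro below records a failure and returns from the test function instead:
  ASSERT_EQ(result.size(), expected.size());

  // EXPECT_* variants record a failure but let the test keep running:
  EXPECT_EQ(result.front(), expected.front());
}
```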
Authors: - Vukasin Milovanovic (https://github.com/vuule) Approvers: - David Wendt (https://github.com/davidwendt) - Nghia Truong (https://github.com/ttnghia) URL: https://github.com/rapidsai/cudf/pull/13812 --- cpp/tests/copying/concatenate_tests.cpp | 2 +- cpp/tests/copying/split_tests.cpp | 11 ++++---- cpp/tests/groupby/tdigest_tests.cu | 8 +++--- cpp/tests/interop/from_arrow_test.cpp | 9 +++---- cpp/tests/interop/to_arrow_test.cpp | 10 +++---- cpp/tests/io/arrow_io_source_test.cpp | 2 +- cpp/tests/io/csv_test.cpp | 3 +-- cpp/tests/io/json_test.cpp | 2 +- cpp/tests/io/parquet_test.cpp | 33 +++++++++++------------ cpp/tests/transform/row_bit_count_test.cu | 4 +-- cpp/tests/transpose/transpose_test.cpp | 6 ++--- 11 files changed, 40 insertions(+), 50 deletions(-) diff --git a/cpp/tests/copying/concatenate_tests.cpp b/cpp/tests/copying/concatenate_tests.cpp index 7701ca1ba56..0c6394637fc 100644 --- a/cpp/tests/copying/concatenate_tests.cpp +++ b/cpp/tests/copying/concatenate_tests.cpp @@ -842,7 +842,7 @@ TEST_F(StructsColumnTest, ConcatenateEmptyStructs) // concatenate auto result = cudf::concatenate(std::vector({*first, *second, *third, *fourth})); - CUDF_EXPECTS(result->size() == expected->size(), "column size changed after concat"); + ASSERT_EQ(result->size(), expected->size()); CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*result, *expected); } diff --git a/cpp/tests/copying/split_tests.cpp b/cpp/tests/copying/split_tests.cpp index da85242410b..7a5c738dc12 100644 --- a/cpp/tests/copying/split_tests.cpp +++ b/cpp/tests/copying/split_tests.cpp @@ -1229,7 +1229,7 @@ void split_nested_list_of_structs(SplitFunc Split, CompareFunc Compare, bool spl // these outputs correctly, this should be safe. auto result = Split(*outer_list, splits); auto expected = cudf::split(static_cast(*outer_list), splits); - CUDF_EXPECTS(result.size() == expected.size(), "Split result size mismatch"); + ASSERT_EQ(result.size(), expected.size()); for (std::size_t index = 0; index < result.size(); index++) { Compare(expected[index], result[index]); @@ -1591,8 +1591,7 @@ TEST_F(ContiguousSplitUntypedTest, ValidityRepartition) cudf::table_view t({*col}); auto result = cudf::contiguous_split(t, {num_rows / 2}); auto expected = cudf::split(t, {num_rows / 2}); - CUDF_EXPECTS(result.size() == expected.size(), - "Mismatch in split results in ValidityRepartition test"); + ASSERT_EQ(result.size(), expected.size()); for (size_t idx = 0; idx < result.size(); idx++) { CUDF_TEST_EXPECT_TABLES_EQUAL(result[idx].table, expected[idx]); @@ -1696,14 +1695,14 @@ TEST_F(ContiguousSplitStringTableTest, EmptyInputColumn) { std::vector splits; auto result = cudf::contiguous_split(src_table, splits); - CUDF_EXPECTS(result.size() == 1, "Incorrect returned contiguous_split result size!"); + ASSERT_EQ(result.size(), 1); CUDF_TEST_EXPECT_TABLES_EQUIVALENT(src_table, result[0].table); } { auto result = do_chunked_pack(src_table); - CUDF_EXPECTS(result.size() == 1, "Incorrect returned contiguous_split result size!"); + ASSERT_EQ(result.size(), 1); CUDF_TEST_EXPECT_TABLES_EQUIVALENT(src_table, result[0].table); } @@ -1711,7 +1710,7 @@ TEST_F(ContiguousSplitStringTableTest, EmptyInputColumn) { std::vector splits{0, 0, 0, 0}; auto result = cudf::contiguous_split(src_table, splits); - CUDF_EXPECTS(result.size() == 5, "Incorrect returned contiguous_split result size!"); + ASSERT_EQ(result.size(), 5); for (size_t idx = 0; idx < result.size(); idx++) { CUDF_TEST_EXPECT_TABLES_EQUIVALENT(src_table, result[idx].table); diff --git 
a/cpp/tests/groupby/tdigest_tests.cu b/cpp/tests/groupby/tdigest_tests.cu index 1d2835675f9..97edc1c45a7 100644 --- a/cpp/tests/groupby/tdigest_tests.cu +++ b/cpp/tests/groupby/tdigest_tests.cu @@ -265,7 +265,7 @@ TEST_F(TDigestMergeTest, Grouped) { auto values = cudf::test::generate_standardized_percentile_distribution( cudf::data_type{cudf::type_id::FLOAT64}); - CUDF_EXPECTS(values->size() == 750000, "Unexpected distribution size"); + ASSERT_EQ(values->size(), 750000); // all in the same group auto keys = cudf::make_fixed_width_column( cudf::data_type{cudf::type_id::INT32}, values->size(), cudf::mask_state::UNALLOCATED); @@ -321,7 +321,7 @@ TEST_F(TDigestMergeTest, Grouped) requests.push_back({*merge_input, std::move(aggregations)}); auto result = gb.aggregate(requests); - CUDF_EXPECTS(result.second[0].results[0]->size() == 2, "Unexpected tdigest merge result size"); + ASSERT_EQ(result.second[0].results[0]->size(), 2); cudf::tdigest::tdigest_column_view tdv(*result.second[0].results[0]); // verify centroids @@ -376,7 +376,7 @@ TEST_F(TDigestMergeTest, Grouped) requests.push_back({*merge_input, std::move(aggregations)}); auto result = gb.aggregate(requests); - CUDF_EXPECTS(result.second[0].results[0]->size() == 2, "Unexpected tdigest merge result size"); + ASSERT_EQ(result.second[0].results[0]->size(), 2); cudf::tdigest::tdigest_column_view tdv(*result.second[0].results[0]); // verify centroids @@ -423,7 +423,7 @@ TEST_F(TDigestMergeTest, Grouped) requests.push_back({*merge_input, std::move(aggregations)}); auto result = gb.aggregate(requests); - CUDF_EXPECTS(result.second[0].results[0]->size() == 2, "Unexpected tdigest merge result size"); + ASSERT_EQ(result.second[0].results[0]->size(), 2); cudf::tdigest::tdigest_column_view tdv(*result.second[0].results[0]); // verify centroids diff --git a/cpp/tests/interop/from_arrow_test.cpp b/cpp/tests/interop/from_arrow_test.cpp index 12bc031d56f..9a5cc3733af 100644 --- a/cpp/tests/interop/from_arrow_test.cpp +++ b/cpp/tests/interop/from_arrow_test.cpp @@ -86,8 +86,8 @@ TEST_F(FromArrowTest, DateTimeTable) std::shared_ptr arr; arrow::TimestampBuilder timestamp_builder(arrow::timestamp(arrow::TimeUnit::type::MILLI), arrow::default_memory_pool()); - CUDF_EXPECTS(timestamp_builder.AppendValues(data).ok(), "Failed to append values"); - CUDF_EXPECTS(timestamp_builder.Finish(&arr).ok(), "Failed to build array"); + ASSERT_TRUE(timestamp_builder.AppendValues(data).ok()); + ASSERT_TRUE(timestamp_builder.Finish(&arr).ok()); std::vector> schema_vector({arrow::field("a", arr->type())}); auto schema = std::make_shared(schema_vector); @@ -119,9 +119,8 @@ TYPED_TEST(FromArrowTestDurationsTest, DurationTable) default: CUDF_FAIL("Unsupported duration unit in arrow"); } arrow::DurationBuilder duration_builder(duration(arrow_unit), arrow::default_memory_pool()); - CUDF_EXPECTS(duration_builder.AppendValues(std::vector{1, 2, 3, 4, 5, 6}).ok(), - "Failed to append values"); - CUDF_EXPECTS(duration_builder.Finish(&arr).ok(), "Failed to build array"); + ASSERT_TRUE(duration_builder.AppendValues(std::vector{1, 2, 3, 4, 5, 6}).ok()); + ASSERT_TRUE(duration_builder.Finish(&arr).ok()); std::vector> schema_vector({arrow::field("a", arr->type())}); auto schema = std::make_shared(schema_vector); diff --git a/cpp/tests/interop/to_arrow_test.cpp b/cpp/tests/interop/to_arrow_test.cpp index 6fc9f47f1f8..97d80984272 100644 --- a/cpp/tests/interop/to_arrow_test.cpp +++ b/cpp/tests/interop/to_arrow_test.cpp @@ -188,9 +188,8 @@ TEST_F(ToArrowTest, DateTimeTable) std::shared_ptr arr; 
arrow::TimestampBuilder timestamp_builder(timestamp(arrow::TimeUnit::type::MILLI), arrow::default_memory_pool()); - CUDF_EXPECTS(timestamp_builder.AppendValues(std::vector{1, 2, 3, 4, 5, 6}).ok(), - "Failed to append values"); - CUDF_EXPECTS(timestamp_builder.Finish(&arr).ok(), "Failed to build array"); + ASSERT_TRUE(timestamp_builder.AppendValues(std::vector{1, 2, 3, 4, 5, 6}).ok()); + ASSERT_TRUE(timestamp_builder.Finish(&arr).ok()); std::vector> schema_vector({arrow::field("a", arr->type())}); auto schema = std::make_shared(schema_vector); @@ -222,9 +221,8 @@ TYPED_TEST(ToArrowTestDurationsTest, DurationTable) default: CUDF_FAIL("Unsupported duration unit in arrow"); } arrow::DurationBuilder duration_builder(duration(arrow_unit), arrow::default_memory_pool()); - CUDF_EXPECTS(duration_builder.AppendValues(std::vector{1, 2, 3, 4, 5, 6}).ok(), - "Failed to append values"); - CUDF_EXPECTS(duration_builder.Finish(&arr).ok(), "Failed to build array"); + ASSERT_TRUE(duration_builder.AppendValues(std::vector{1, 2, 3, 4, 5, 6}).ok()); + ASSERT_TRUE(duration_builder.Finish(&arr).ok()); std::vector> schema_vector({arrow::field("a", arr->type())}); auto schema = std::make_shared(schema_vector); diff --git a/cpp/tests/io/arrow_io_source_test.cpp b/cpp/tests/io/arrow_io_source_test.cpp index fb9e20843ed..ed297d2da42 100644 --- a/cpp/tests/io/arrow_io_source_test.cpp +++ b/cpp/tests/io/arrow_io_source_test.cpp @@ -97,7 +97,7 @@ TEST_F(ArrowIOTest, S3FileSystem) close_s3_func = reinterpret_cast( dlsym(whole_app, "_ZN5arrow2fs17EnsureS3FinalizedEv")); - if (close_s3_func) { CUDF_EXPECTS(close_s3_func().ok(), "Failed to finalize s3 filesystem"); } + if (close_s3_func) { EXPECT_TRUE(close_s3_func().ok()); } dlclose(whole_app); } } diff --git a/cpp/tests/io/csv_test.cpp b/cpp/tests/io/csv_test.cpp index 9da97c00712..2b501f45b47 100644 --- a/cpp/tests/io/csv_test.cpp +++ b/cpp/tests/io/csv_test.cpp @@ -169,8 +169,7 @@ void check_float_column(cudf::column_view const& col_lhs, CUDF_TEST_EXPECT_COLUMN_PROPERTIES_EQUIVALENT(col_lhs, (wrapper{data.begin(), data.end(), validity})); - CUDF_EXPECTS(col_lhs.null_count() == 0 and col_rhs.null_count() == 0, - "All elements should be valid"); + EXPECT_TRUE(col_lhs.null_count() == 0 and col_rhs.null_count() == 0); EXPECT_THAT(cudf::test::to_host(col_lhs).first, ::testing::Pointwise(FloatNearPointwise(tol), data)); } diff --git a/cpp/tests/io/json_test.cpp b/cpp/tests/io/json_test.cpp index 97d5846294a..5a30be755d3 100644 --- a/cpp/tests/io/json_test.cpp +++ b/cpp/tests/io/json_test.cpp @@ -149,7 +149,7 @@ void check_float_column(cudf::column_view const& col, valid_t const& validity) { CUDF_TEST_EXPECT_COLUMN_PROPERTIES_EQUAL(col, (wrapper{data.begin(), data.end(), validity})); - CUDF_EXPECTS(col.null_count() == 0, "All elements should be valid"); + EXPECT_EQ(col.null_count(), 0); EXPECT_THAT(cudf::test::to_host(col).first, ::testing::Pointwise(FloatNearPointwise(1e-6), data)); } diff --git a/cpp/tests/io/parquet_test.cpp b/cpp/tests/io/parquet_test.cpp index ea2bad0cabf..a86190239fe 100644 --- a/cpp/tests/io/parquet_test.cpp +++ b/cpp/tests/io/parquet_test.cpp @@ -213,12 +213,10 @@ void read_footer(std::unique_ptr const& source, auto const ender = reinterpret_cast(ender_buffer->data()); // checks for valid header, footer, and file length - CUDF_EXPECTS(len > header_len + ender_len, "Incorrect data source"); - CUDF_EXPECTS(header->magic == cudf::io::parquet::parquet_magic && - ender->magic == cudf::io::parquet::parquet_magic, - "Corrupted header or footer"); - 
CUDF_EXPECTS(ender->footer_len != 0 && ender->footer_len <= (len - header_len - ender_len), - "Incorrect footer length"); + ASSERT_GT(len, header_len + ender_len); + ASSERT_TRUE(header->magic == cudf::io::parquet::parquet_magic && + ender->magic == cudf::io::parquet::parquet_magic); + ASSERT_TRUE(ender->footer_len != 0 && ender->footer_len <= (len - header_len - ender_len)); // parquet files end with 4-byte footer_length and 4-byte magic == "PAR1" // seek backwards from the end of the file (footer_length + 8 bytes of ender) @@ -228,7 +226,7 @@ void read_footer(std::unique_ptr const& source, // returns true on success bool res = cp.read(file_meta_data); - CUDF_EXPECTS(res, "Cannot parse file metadata"); + ASSERT_TRUE(res); } // returns the number of bits used for dictionary encoding data at the given page location. @@ -1622,7 +1620,7 @@ TEST_F(ParquetChunkedWriterTest, LargeTables) cudf::io::chunked_parquet_writer_options args = cudf::io::chunked_parquet_writer_options::builder(cudf::io::sink_info{filepath}); auto md = cudf::io::parquet_chunked_writer(args).write(*table1).write(*table2).close(); - CUDF_EXPECTS(!md, "The return value should be null."); + ASSERT_EQ(md, nullptr); cudf::io::parquet_reader_options read_opts = cudf::io::parquet_reader_options::builder(cudf::io::source_info{filepath}); @@ -1653,7 +1651,7 @@ TEST_F(ParquetChunkedWriterTest, ManyTables) writer.write(tbl); }); auto md = writer.close({"dummy/path"}); - CUDF_EXPECTS(md, "The returned metadata should not be null."); + ASSERT_NE(md, nullptr); cudf::io::parquet_reader_options read_opts = cudf::io::parquet_reader_options::builder(cudf::io::source_info{filepath}); @@ -3660,10 +3658,10 @@ TEST_F(ParquetWriterTest, CheckPageRows) cudf::io::parquet::FileMetaData fmd; read_footer(source, &fmd); - CUDF_EXPECTS(fmd.row_groups.size() > 0, "No row groups found"); - CUDF_EXPECTS(fmd.row_groups[0].columns.size() == 1, "Invalid number of columns"); + ASSERT_GT(fmd.row_groups.size(), 0); + ASSERT_EQ(fmd.row_groups[0].columns.size(), 1); auto const& first_chunk = fmd.row_groups[0].columns[0].meta_data; - CUDF_EXPECTS(first_chunk.data_page_offset > 0, "Invalid location for first data page"); + ASSERT_GT(first_chunk.data_page_offset, 0); // read first data page header. sizeof(PageHeader) is not exact, but the thrift encoded // version should be smaller than size of the struct. @@ -3696,10 +3694,10 @@ TEST_F(ParquetWriterTest, CheckPageRowsAdjusted) cudf::io::parquet::FileMetaData fmd; read_footer(source, &fmd); - CUDF_EXPECTS(fmd.row_groups.size() > 0, "No row groups found"); - CUDF_EXPECTS(fmd.row_groups[0].columns.size() == 1, "Invalid number of columns"); + ASSERT_GT(fmd.row_groups.size(), 0); + ASSERT_EQ(fmd.row_groups[0].columns.size(), 1); auto const& first_chunk = fmd.row_groups[0].columns[0].meta_data; - CUDF_EXPECTS(first_chunk.data_page_offset > 0, "Invalid location for first data page"); + ASSERT_GT(first_chunk.data_page_offset, 0); // read first data page header. sizeof(PageHeader) is not exact, but the thrift encoded // version should be smaller than size of the struct. 
@@ -4005,11 +4003,10 @@ TYPED_TEST(ParquetWriterComparableTypeTest, ThreeColumnSorted) cudf::io::parquet::FileMetaData fmd; read_footer(source, &fmd); - CUDF_EXPECTS(fmd.row_groups.size() > 0, "No row groups found"); + ASSERT_GT(fmd.row_groups.size(), 0); auto const& columns = fmd.row_groups[0].columns; - CUDF_EXPECTS(columns.size() == static_cast(expected.num_columns()), - "Invalid number of columns"); + ASSERT_EQ(columns.size(), static_cast(expected.num_columns())); // now check that the boundary order for chunk 1 is ascending, // chunk 2 is descending, and chunk 3 is unordered diff --git a/cpp/tests/transform/row_bit_count_test.cu b/cpp/tests/transform/row_bit_count_test.cu index 4832cdf816f..236407e62f3 100644 --- a/cpp/tests/transform/row_bit_count_test.cu +++ b/cpp/tests/transform/row_bit_count_test.cu @@ -750,7 +750,7 @@ TEST_F(RowBitCount, EmptyTable) { cudf::table_view empty; auto result = cudf::row_bit_count(empty); - CUDF_EXPECTS(result != nullptr && result->size() == 0, "Expected an empty column"); + EXPECT_TRUE(result != nullptr && result->size() == 0); } { @@ -759,6 +759,6 @@ TEST_F(RowBitCount, EmptyTable) cudf::table_view empty({*strings, *ints}); auto result = cudf::row_bit_count(empty); - CUDF_EXPECTS(result != nullptr && result->size() == 0, "Expected an empty column"); + EXPECT_TRUE(result != nullptr && result->size() == 0); } } diff --git a/cpp/tests/transpose/transpose_test.cpp b/cpp/tests/transpose/transpose_test.cpp index 93cc4aaa100..cf46dd74138 100644 --- a/cpp/tests/transpose/transpose_test.cpp +++ b/cpp/tests/transpose/transpose_test.cpp @@ -146,12 +146,10 @@ void run_test(size_t ncols, size_t nrows, bool add_nulls) auto result = transpose(input_view); auto result_view = std::get<1>(result); - CUDF_EXPECTS(result_view.num_columns() == expected_view.num_columns(), - "Expected same number of columns"); + ASSERT_EQ(result_view.num_columns(), expected_view.num_columns()); for (cudf::size_type i = 0; i < result_view.num_columns(); ++i) { CUDF_TEST_EXPECT_COLUMNS_EQUAL(result_view.column(i), expected_view.column(i)); - CUDF_EXPECTS(result_view.column(i).null_count() == expected_nulls[i], - "Expected correct null count"); + EXPECT_EQ(result_view.column(i).null_count(), expected_nulls[i]); } } From 8370cbe7f21872ea7106a7a8577a1b1a124608ff Mon Sep 17 00:00:00 2001 From: Jason Lowe Date: Fri, 4 Aug 2023 14:30:15 -0500 Subject: [PATCH 030/230] Set native handles to null on close in Java wrapper classes (#13818) This updates a few Java wrapper classes that track native resources to zero the native handles when they are closed to make it easier to track down use-after-close errors. 
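For illustration only (`NativeResourceSketch` below is a made-up class, not one of the cudf wrappers touched in this diff), a minimal sketch of the pattern being applied: clear the native handle in a `finally` block during `close()`, so a later use fails fast instead of handing a stale pointer to native code.

```java
// Illustrative sketch -- not a cudf class. Shows the close-then-null pattern:
// zero the native handle in a finally block so use-after-close fails fast.
class NativeResourceSketch implements AutoCloseable {
  private long nativeHandle; // 0 means "already closed"

  NativeResourceSketch(long handle) {
    this.nativeHandle = handle;
  }

  long totalContiguousSize() {
    if (nativeHandle == 0) {
      // Same exception type the new testUseAfterFree test asserts on ColumnVector.
      throw new NullPointerException("native resource used after close");
    }
    return nativeGetSize(nativeHandle);
  }

  @Override
  public void close() {
    try {
      nativeDelete(nativeHandle); // stand-in for a JNI call such as chunkedPackDelete
    } finally {
      nativeHandle = 0; // cleared even if the native delete throws
    }
  }

  // Hypothetical stand-ins for the real JNI methods.
  private static long nativeGetSize(long handle) { return 0L; }
  private static void nativeDelete(long handle) { }
}
```

The new `testUseAfterFree` test in the diff below exercises exactly this behaviour on `ColumnVector`.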
Authors: - Jason Lowe (https://github.com/jlowe) Approvers: - Gera Shegalov (https://github.com/gerashegalov) - Nghia Truong (https://github.com/ttnghia) URL: https://github.com/rapidsai/cudf/pull/13818 --- java/src/main/java/ai/rapids/cudf/ChunkedPack.java | 6 +++++- java/src/main/java/ai/rapids/cudf/ColumnVector.java | 1 + java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java | 7 +++++++ 3 files changed, 13 insertions(+), 1 deletion(-) diff --git a/java/src/main/java/ai/rapids/cudf/ChunkedPack.java b/java/src/main/java/ai/rapids/cudf/ChunkedPack.java index 90ec05cdbbf..d44c1322902 100644 --- a/java/src/main/java/ai/rapids/cudf/ChunkedPack.java +++ b/java/src/main/java/ai/rapids/cudf/ChunkedPack.java @@ -88,7 +88,11 @@ public PackedColumnMetadata buildMetadata() { @Override public void close() { - chunkedPackDelete(nativePtr); + try { + chunkedPackDelete(nativePtr); + } finally { + nativePtr = 0; + } } private static native long chunkedPackGetTotalContiguousSize(long nativePtr); diff --git a/java/src/main/java/ai/rapids/cudf/ColumnVector.java b/java/src/main/java/ai/rapids/cudf/ColumnVector.java index a3fca6777a2..0595d58c7cc 100644 --- a/java/src/main/java/ai/rapids/cudf/ColumnVector.java +++ b/java/src/main/java/ai/rapids/cudf/ColumnVector.java @@ -264,6 +264,7 @@ public synchronized void close() { eventHandler.onClosed(this, refCount); } if (refCount == 0) { + super.close(); offHeap.clean(false); } else if (refCount < 0) { offHeap.logRefCountDebug("double free " + this); diff --git a/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java b/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java index e1da4b6a1ea..b462d70ccd2 100644 --- a/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java +++ b/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java @@ -6824,4 +6824,11 @@ public void testColumnViewNullCount() { assertEquals(vector.getNullCount(), view.getNullCount()); } } + + @Test + public void testUseAfterFree() { + ColumnVector vector = ColumnVector.fromBoxedInts(1, 2, 3); + vector.close(); + assertThrows(NullPointerException.class, vector::getDeviceMemorySize); + } } From a8ef4d4e49d584d1553283f6cf9aef17baaca62f Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Fri, 4 Aug 2023 16:19:33 -0700 Subject: [PATCH 031/230] Update to Cython 3.0.0 (#13777) This PR contains the minimal set of changes to compile using Cython 3 without warnings. Future PRs can be made to take advantage of new or improved features. The specific changes are: - Ensuring `nogil` always comes after `except`. `nogil except *` is a compile-time error in Cython 3 - Removing any extern cdef functions that use C++ rvalues. These were never supported by Cython, but prior to 3.0 they were silently ignored whereas now Cython throws warnings during compilation - Relative imports are no longer off by one level in pxd files and must be adjusted accordingly (see https://github.com/cython/cython/issues/3442) There are a large number of outstanding warnings due to https://github.com/NVIDIA/cuda-python/issues/44. cuda-python for CUDA 12 has the necessary fix, but we will need a cuda-python 11.8.* bugfix with a backport to make those warnings go away. There are also warnings coming from pyarrow due to https://github.com/apache/arrow/issues/34564. pyarrow 12 contains the necessary fixes, so these issues should be resolved once #13728 is merged.
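To illustrate the first and third bullets above, here is a small sketch with made-up names (these are not lines from this diff):

```cython
# Sketch of a package-internal .pxd file -- the module and class names here
# are illustrative only, not part of this PR.

# 1) Relative cimports in .pxd files are no longer off by one package level,
#    so the redundant package segment is dropped:
#      before (Cython 0.29): from .mypkg.utils cimport helper
from .utils cimport helper

# 2) The exception clause must now come before `nogil`:
cdef class DataSourceSketch:
    #   before (Cython 0.29): cdef int read_bytes(self, char* buf, int n) nogil except *
    cdef int read_bytes(self, char* buf, int n) except * nogil
```

The `datasource.pxd`/`datasource.pyx` and `pylibcudf/__init__.pxd` hunks below show the real instances of both changes.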
Authors: - Vyas Ramasubramani (https://github.com/vyasr) - GALI PREM SAGAR (https://github.com/galipremsagar) Approvers: - Ray Douglass (https://github.com/raydouglass) - Bradley Dice (https://github.com/bdice) - Benjamin Zaitlen (https://github.com/quasiben) - GALI PREM SAGAR (https://github.com/galipremsagar) - https://github.com/jakirkham URL: https://github.com/rapidsai/cudf/pull/13777 --- ci/test_wheel_dask_cudf.sh | 2 +- conda/environments/all_cuda-118_arch-x86_64.yaml | 2 +- conda/environments/all_cuda-120_arch-x86_64.yaml | 2 +- conda/recipes/cudf/meta.yaml | 2 +- conda/recipes/cudf_kafka/meta.yaml | 2 +- dependencies.yaml | 2 +- python/cudf/cudf/_lib/cpp/column/column.pxd | 8 +------- python/cudf/cudf/_lib/cpp/column/column_view.pxd | 3 +-- python/cudf/cudf/_lib/cpp/table/table.pxd | 3 +-- python/cudf/cudf/_lib/csv.pyx | 2 +- python/cudf/cudf/_lib/io/datasource.pxd | 4 ++-- python/cudf/cudf/_lib/io/datasource.pyx | 4 ++-- python/cudf/cudf/_lib/json.pyx | 4 ++-- python/cudf/cudf/_lib/pylibcudf/__init__.pxd | 14 +++++--------- python/cudf/pyproject.toml | 2 +- python/cudf_kafka/pyproject.toml | 2 +- 16 files changed, 23 insertions(+), 35 deletions(-) diff --git a/ci/test_wheel_dask_cudf.sh b/ci/test_wheel_dask_cudf.sh index 9e190ea9321..38fce04459e 100755 --- a/ci/test_wheel_dask_cudf.sh +++ b/ci/test_wheel_dask_cudf.sh @@ -11,7 +11,7 @@ RAPIDS_PY_WHEEL_NAME="cudf_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-from python -m pip install --no-deps ./local-cudf-dep/cudf*.whl # Always install latest dask for testing -python -m pip install git+https://github.com/dask/dask.git@2023.7.1 git+https://github.com/dask/distributed.git@2023.7.1 git+https://github.com/rapidsai/dask-cuda.git@branch-23.08 +python -m pip install git+https://github.com/dask/dask.git@2023.7.1 git+https://github.com/dask/distributed.git@2023.7.1 git+https://github.com/rapidsai/dask-cuda.git@branch-23.10 # echo to expand wildcard before adding `[extra]` requires for pip python -m pip install $(echo ./dist/dask_cudf*.whl)[test] diff --git a/conda/environments/all_cuda-118_arch-x86_64.yaml b/conda/environments/all_cuda-118_arch-x86_64.yaml index 8f04f7c06b8..06a3635bb05 100644 --- a/conda/environments/all_cuda-118_arch-x86_64.yaml +++ b/conda/environments/all_cuda-118_arch-x86_64.yaml @@ -22,7 +22,7 @@ dependencies: - cudatoolkit - cupy>=12.0.0 - cxx-compiler -- cython>=0.29,<0.30 +- cython>=3.0.0 - dask-core==2023.7.1 - dask-cuda==23.10.* - dask==2023.7.1 diff --git a/conda/environments/all_cuda-120_arch-x86_64.yaml b/conda/environments/all_cuda-120_arch-x86_64.yaml index 19ca856df20..9273dd14867 100644 --- a/conda/environments/all_cuda-120_arch-x86_64.yaml +++ b/conda/environments/all_cuda-120_arch-x86_64.yaml @@ -23,7 +23,7 @@ dependencies: - cuda-version=12.0 - cupy>=12.0.0 - cxx-compiler -- cython>=0.29,<0.30 +- cython>=3.0.0 - dask-core==2023.7.1 - dask-cuda==23.10.* - dask==2023.7.1 diff --git a/conda/recipes/cudf/meta.yaml b/conda/recipes/cudf/meta.yaml index 612d2b177f6..a909b72c878 100644 --- a/conda/recipes/cudf/meta.yaml +++ b/conda/recipes/cudf/meta.yaml @@ -57,7 +57,7 @@ requirements: host: - protobuf ==4.21.* - python - - cython >=0.29,<0.30 + - cython >=3.0.0 - scikit-build >=0.13.1 - setuptools - dlpack >=0.5,<0.6.0a0 diff --git a/conda/recipes/cudf_kafka/meta.yaml b/conda/recipes/cudf_kafka/meta.yaml index fff5bf1e840..ec0cc402511 100644 --- a/conda/recipes/cudf_kafka/meta.yaml +++ b/conda/recipes/cudf_kafka/meta.yaml @@ -43,7 +43,7 @@ requirements: - sysroot_{{ target_platform }} {{ sysroot_version 
}} host: - python - - cython >=0.29,<0.30 + - cython >=3.0.0 - cuda-version ={{ cuda_version }} - cudf ={{ version }} - libcudf_kafka ={{ version }} diff --git a/dependencies.yaml b/dependencies.yaml index b0022e8564c..3e5e703c239 100644 --- a/dependencies.yaml +++ b/dependencies.yaml @@ -250,7 +250,7 @@ dependencies: common: - output_types: [conda, requirements, pyproject] packages: - - cython>=0.29,<0.30 + - cython>=3.0.0 # Hard pin the patch version used during the build. This must be kept # in sync with the version pinned in get_arrow.cmake. - pyarrow==12.0.1.* diff --git a/python/cudf/cudf/_lib/cpp/column/column.pxd b/python/cudf/cudf/_lib/cpp/column/column.pxd index 205a1548c54..136f1d795a9 100644 --- a/python/cudf/cudf/_lib/cpp/column/column.pxd +++ b/python/cudf/cudf/_lib/cpp/column/column.pxd @@ -1,4 +1,4 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. +# Copyright (c) 2020-2023, NVIDIA CORPORATION. from libcpp cimport bool from libcpp.memory cimport unique_ptr @@ -20,12 +20,6 @@ cdef extern from "cudf/column/column.hpp" namespace "cudf" nogil: column() except + column(const column& other) except + - column( - data_type dtype, - size_type size, - device_buffer&& data - ) except + - column(column_view view) except + size_type size() except + diff --git a/python/cudf/cudf/_lib/cpp/column/column_view.pxd b/python/cudf/cudf/_lib/cpp/column/column_view.pxd index 39c1c958531..edd013d9340 100644 --- a/python/cudf/cudf/_lib/cpp/column/column_view.pxd +++ b/python/cudf/cudf/_lib/cpp/column/column_view.pxd @@ -1,4 +1,4 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. +# Copyright (c) 2020-2023, NVIDIA CORPORATION. from libcpp cimport bool from libcpp.vector cimport vector @@ -12,7 +12,6 @@ cdef extern from "cudf/column/column_view.hpp" namespace "cudf" nogil: column_view(const column_view& other) except + column_view& operator=(const column_view&) except + - column_view& operator=(column_view&&) except + column_view( data_type type, diff --git a/python/cudf/cudf/_lib/cpp/table/table.pxd b/python/cudf/cudf/_lib/cpp/table/table.pxd index d7f3de76c63..ac93e3def19 100644 --- a/python/cudf/cudf/_lib/cpp/table/table.pxd +++ b/python/cudf/cudf/_lib/cpp/table/table.pxd @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2022, NVIDIA CORPORATION. +# Copyright (c) 2020-2023, NVIDIA CORPORATION. from libcpp.memory cimport unique_ptr from libcpp.vector cimport vector @@ -11,7 +11,6 @@ from cudf._lib.cpp.types cimport size_type cdef extern from "cudf/table/table.hpp" namespace "cudf" nogil: cdef cppclass table: table(const table&) except + - table(vector[unique_ptr[column]]&& columns) except + table(table_view) except + size_type num_columns() except + size_type num_rows() except + diff --git a/python/cudf/cudf/_lib/csv.pyx b/python/cudf/cudf/_lib/csv.pyx index df6ed89ac7e..3f275e2635f 100644 --- a/python/cudf/cudf/_lib/csv.pyx +++ b/python/cudf/cudf/_lib/csv.pyx @@ -543,7 +543,7 @@ def write_csv( ) -cdef data_type _get_cudf_data_type_from_dtype(object dtype) except +: +cdef data_type _get_cudf_data_type_from_dtype(object dtype) except *: # TODO: Remove this work-around Dictionary types # in libcudf are fully mapped to categorical columns: # https://github.com/rapidsai/cudf/issues/3960 diff --git a/python/cudf/cudf/_lib/io/datasource.pxd b/python/cudf/cudf/_lib/io/datasource.pxd index a7a3731a0e6..e8fe79d2685 100644 --- a/python/cudf/cudf/_lib/io/datasource.pxd +++ b/python/cudf/cudf/_lib/io/datasource.pxd @@ -1,4 +1,4 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. +# Copyright (c) 2020-2023, NVIDIA CORPORATION. 
from libcpp.memory cimport shared_ptr @@ -6,7 +6,7 @@ from cudf._lib.cpp.io.types cimport arrow_io_source, datasource cdef class Datasource: - cdef datasource* get_datasource(self) nogil except * + cdef datasource* get_datasource(self) except * nogil cdef class NativeFileDatasource(Datasource): cdef shared_ptr[arrow_io_source] c_datasource diff --git a/python/cudf/cudf/_lib/io/datasource.pyx b/python/cudf/cudf/_lib/io/datasource.pyx index 7402779a6ac..b39a1aee9b8 100644 --- a/python/cudf/cudf/_lib/io/datasource.pyx +++ b/python/cudf/cudf/_lib/io/datasource.pyx @@ -1,4 +1,4 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. +# Copyright (c) 2020-2023, NVIDIA CORPORATION. from libcpp.memory cimport shared_ptr from pyarrow.includes.libarrow cimport CRandomAccessFile @@ -8,7 +8,7 @@ from cudf._lib.cpp.io.types cimport arrow_io_source, datasource cdef class Datasource: - cdef datasource* get_datasource(self) nogil except *: + cdef datasource* get_datasource(self) except * nogil: with gil: raise NotImplementedError("get_datasource() should not " + "be directly invoked here") diff --git a/python/cudf/cudf/_lib/json.pyx b/python/cudf/cudf/_lib/json.pyx index af4232a8734..611baed7fd7 100644 --- a/python/cudf/cudf/_lib/json.pyx +++ b/python/cudf/cudf/_lib/json.pyx @@ -211,7 +211,7 @@ def write_json( ) -cdef schema_element _get_cudf_schema_element_from_dtype(object dtype) except +: +cdef schema_element _get_cudf_schema_element_from_dtype(object dtype) except *: cdef schema_element s_element cdef data_type lib_type if cudf.api.types.is_categorical_dtype(dtype): @@ -236,7 +236,7 @@ cdef schema_element _get_cudf_schema_element_from_dtype(object dtype) except +: return s_element -cdef data_type _get_cudf_data_type_from_dtype(object dtype) except +: +cdef data_type _get_cudf_data_type_from_dtype(object dtype) except *: if cudf.api.types.is_categorical_dtype(dtype): raise NotImplementedError( "CategoricalDtype as dtype is not yet " diff --git a/python/cudf/cudf/_lib/pylibcudf/__init__.pxd b/python/cudf/cudf/_lib/pylibcudf/__init__.pxd index 2d2bfe8ef8e..b4f8bfad4fb 100644 --- a/python/cudf/cudf/_lib/pylibcudf/__init__.pxd +++ b/python/cudf/cudf/_lib/pylibcudf/__init__.pxd @@ -1,15 +1,11 @@ # Copyright (c) 2023, NVIDIA CORPORATION. # TODO: Verify consistent usage of relative/absolute imports in pylibcudf. -# Relative Cython imports always look one level too high. This is a known bug -# https://github.com/cython/cython/issues/3442 -# that is fixed in Cython 3 -# https://github.com/cython/cython/pull/4552 -from .pylibcudf cimport copying -from .pylibcudf.column cimport Column -from .pylibcudf.gpumemoryview cimport gpumemoryview -from .pylibcudf.table cimport Table -from .pylibcudf.types cimport DataType, TypeId +from . 
cimport copying +from .column cimport Column +from .gpumemoryview cimport gpumemoryview +from .table cimport Table +from .types cimport DataType, TypeId __all__ = [ "Column", diff --git a/python/cudf/pyproject.toml b/python/cudf/pyproject.toml index 5cd63893844..574769f68d1 100644 --- a/python/cudf/pyproject.toml +++ b/python/cudf/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta" requires = [ "cmake>=3.26.4", - "cython>=0.29,<0.30", + "cython>=3.0.0", "ninja", "numpy>=1.21", "protoc-wheel", diff --git a/python/cudf_kafka/pyproject.toml b/python/cudf_kafka/pyproject.toml index 081a2f69800..a6ef867451b 100644 --- a/python/cudf_kafka/pyproject.toml +++ b/python/cudf_kafka/pyproject.toml @@ -3,7 +3,7 @@ [build-system] requires = [ - "cython>=0.29,<0.30", + "cython>=3.0.0", "numpy>=1.21", "pyarrow==12.0.1.*", "setuptools", From 7eb40718c2e7f923b4e28d2ca1c320e82f983360 Mon Sep 17 00:00:00 2001 From: Robert Maynard Date: Mon, 7 Aug 2023 11:15:58 -0400 Subject: [PATCH 032/230] No need to dlsym EnsureS3Finalized we can call it directly (#13819) Previously I didn't realize that the `ARROW_S3` define was provided to consumers via `arrow/util/config.h`. We can remove the dlopen hack and just guard the entire S3 logic. Authors: - Robert Maynard (https://github.com/robertmaynard) Approvers: - Bradley Dice (https://github.com/bdice) - Vukasin Milovanovic (https://github.com/vuule) - Vyas Ramasubramani (https://github.com/vyasr) URL: https://github.com/rapidsai/cudf/pull/13819 --- cpp/tests/io/arrow_io_source_test.cpp | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/cpp/tests/io/arrow_io_source_test.cpp b/cpp/tests/io/arrow_io_source_test.cpp index ed297d2da42..89600d4cb46 100644 --- a/cpp/tests/io/arrow_io_source_test.cpp +++ b/cpp/tests/io/arrow_io_source_test.cpp @@ -27,6 +27,7 @@ #include #include +#include #include #include @@ -87,19 +88,16 @@ TEST_F(ArrowIOTest, S3FileSystem) ASSERT_EQ(1, tbl.tbl->num_columns()); // Only single column specified in reader_options ASSERT_EQ(244, tbl.tbl->num_rows()); // known number of rows from the S3 file } + +#ifdef ARROW_S3 if (!s3_unsupported) { // Verify that we are using Arrow with S3, and call finalize // https://github.com/apache/arrow/issues/36974 // This needs to be in a separate conditional to ensure we call // finalize after all arrow_io_source instances have been deleted. - void* whole_app = dlopen(NULL, RTLD_LAZY); - decltype(arrow::fs::EnsureS3Finalized)* close_s3_func = nullptr; - - close_s3_func = reinterpret_cast( - dlsym(whole_app, "_ZN5arrow2fs17EnsureS3FinalizedEv")); - if (close_s3_func) { EXPECT_TRUE(close_s3_func().ok()); } - dlclose(whole_app); + [[maybe_unused]] auto _ = arrow::fs::EnsureS3Finalized(); } +#endif } CUDF_TEST_PROGRAM_MAIN() From e92de8113d186389ec867bd7957288b82e439da2 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Mon, 7 Aug 2023 09:15:00 -0700 Subject: [PATCH 033/230] Fix all warnings in Python docs (#13789) The Sphinx documentation has historically been warning-filled, which makes it difficult to identify when there are real issues like missing APIs. This PR fixes all the current issues and converts warnings to errors during the build, ensuring that doc builds are reliable indicators of issues in the future. I will say that there are a few changes that may not be exactly what we want, particularly in cases of including APIs that may not be documented in exactly the same way in pandas. 
However, I think we'd be better off merging this PR so that we can get to a 0 warnings state and then work through further improvements in follow-ups where the build will be more robust. Here is an inexhaustive list of the most significant changes: - Adds all missing BaseIndex APIs. The goal of BaseIndex is to provide an abstract interface defining all functions that should match pandas.Index, but up until now some methods were missing. This PR does not implement any new ones, it either lifts existing implementations up from subclasses (where those implementations are generic for all Index types) or it simply defines them as returning NotImplemented. The result is that all methods at least exist so that docs don't complain. - Cleans up the listed APIs in rst files so that all existing APIs are included somewhere and no nonexistent APIs are listed anywhere. APIs that don't have an exact equivalent in the pandas docs are given a new home in these docs. That includes pieces like extension dtypes, which were previously documented in the user guide and therefore weren't part of any summary list (causing warnings). - Fixed missing dependencies for doc notebooks. - Fixed various formatting issues with docstrings, especially around bulleted lists that were missing the requisite spacing to be rendered correctly. - Fixing header ordering (going from level 1 to level 3 headings is a warning in Sphinx) and links in notebooks. Authors: - Vyas Ramasubramani (https://github.com/vyasr) Approvers: - AJ Schmidt (https://github.com/ajschmidt8) - GALI PREM SAGAR (https://github.com/galipremsagar) URL: https://github.com/rapidsai/cudf/pull/13789 --- ci/build_docs.sh | 16 +- .../all_cuda-118_arch-x86_64.yaml | 1 + .../all_cuda-120_arch-x86_64.yaml | 1 + dependencies.yaml | 3 + docs/cudf/Makefile | 2 +- docs/cudf/source/api_docs/dataframe.rst | 11 +- .../cudf/source/api_docs/extension_dtypes.rst | 176 +++++++++++++++ docs/cudf/source/api_docs/index.rst | 1 + docs/cudf/source/api_docs/index_objects.rst | 20 +- docs/cudf/source/api_docs/io.rst | 2 + docs/cudf/source/api_docs/list_handling.rst | 6 + docs/cudf/source/api_docs/series.rst | 20 ++ docs/cudf/source/api_docs/string_handling.rst | 6 + docs/cudf/source/api_docs/struct_handling.rst | 6 + .../cudf/source/api_docs/subword_tokenize.rst | 1 + docs/cudf/source/conf.py | 19 +- docs/cudf/source/user_guide/10min.ipynb | 74 ++----- .../user_guide/cudf.CategoricalDtype.rst | 19 -- .../user_guide/cudf.Decimal128Dtype.rst | 20 -- .../source/user_guide/cudf.Decimal32Dtype.rst | 20 -- .../source/user_guide/cudf.Decimal64Dtype.rst | 20 -- .../cudf/source/user_guide/cudf.ListDtype.rst | 19 -- .../source/user_guide/cudf.StructDtype.rst | 18 -- .../cudf/source/user_guide/cupy-interop.ipynb | 14 +- docs/cudf/source/user_guide/data-types.md | 41 ++-- docs/cudf/source/user_guide/groupby.md | 5 +- .../source/user_guide/guide-to-udfs.ipynb | 2 +- .../source/user_guide/pandas-comparison.md | 2 +- docs/dask_cudf/Makefile | 3 +- python/cudf/cudf/core/_base_index.py | 204 ++++++++++++++---- python/cudf/cudf/core/column/string.py | 14 +- python/cudf/cudf/core/dataframe.py | 17 +- python/cudf/cudf/core/dtypes.py | 24 ++- python/cudf/cudf/core/frame.py | 84 +------- python/cudf/cudf/core/index.py | 32 +-- python/cudf/cudf/core/indexed_frame.py | 71 +++++- python/cudf/cudf/core/multiindex.py | 13 +- python/cudf/cudf/core/series.py | 20 ++ python/cudf/cudf/core/single_column_frame.py | 24 +-- 39 files changed, 640 insertions(+), 411 deletions(-) create mode 100644 
docs/cudf/source/api_docs/extension_dtypes.rst delete mode 100644 docs/cudf/source/user_guide/cudf.CategoricalDtype.rst delete mode 100644 docs/cudf/source/user_guide/cudf.Decimal128Dtype.rst delete mode 100644 docs/cudf/source/user_guide/cudf.Decimal32Dtype.rst delete mode 100644 docs/cudf/source/user_guide/cudf.Decimal64Dtype.rst delete mode 100644 docs/cudf/source/user_guide/cudf.ListDtype.rst delete mode 100644 docs/cudf/source/user_guide/cudf.StructDtype.rst diff --git a/ci/build_docs.sh b/ci/build_docs.sh index 52e9419c82d..1ed047a500b 100755 --- a/ci/build_docs.sh +++ b/ci/build_docs.sh @@ -38,20 +38,20 @@ popd rapids-logger "Build Python docs" pushd docs/cudf -sphinx-build -b dirhtml source _html -sphinx-build -b text source _text +make dirhtml +make text mkdir -p "${RAPIDS_DOCS_DIR}/cudf/"{html,txt} -mv _html/* "${RAPIDS_DOCS_DIR}/cudf/html" -mv _text/* "${RAPIDS_DOCS_DIR}/cudf/txt" +mv build/dirhtml/* "${RAPIDS_DOCS_DIR}/cudf/html" +mv build/text/* "${RAPIDS_DOCS_DIR}/cudf/txt" popd rapids-logger "Build dask-cuDF Sphinx docs" pushd docs/dask_cudf -sphinx-build -b dirhtml source _html -sphinx-build -b text source _text +make dirhtml +make text mkdir -p "${RAPIDS_DOCS_DIR}/dask-cudf/"{html,txt} -mv _html/* "${RAPIDS_DOCS_DIR}/dask-cudf/html" -mv _text/* "${RAPIDS_DOCS_DIR}/dask-cudf/txt" +mv build/dirhtml/* "${RAPIDS_DOCS_DIR}/dask-cudf/html" +mv build/text/* "${RAPIDS_DOCS_DIR}/dask-cudf/txt" popd rapids-upload-docs diff --git a/conda/environments/all_cuda-118_arch-x86_64.yaml b/conda/environments/all_cuda-118_arch-x86_64.yaml index 06a3635bb05..dca0e3fe901 100644 --- a/conda/environments/all_cuda-118_arch-x86_64.yaml +++ b/conda/environments/all_cuda-118_arch-x86_64.yaml @@ -46,6 +46,7 @@ dependencies: - libkvikio==23.10.* - librdkafka>=1.9.0,<1.10.0a0 - librmm==23.10.* +- make - mimesis>=4.1.0 - moto>=4.0.8 - msgpack-python diff --git a/conda/environments/all_cuda-120_arch-x86_64.yaml b/conda/environments/all_cuda-120_arch-x86_64.yaml index 9273dd14867..a9d0b260aee 100644 --- a/conda/environments/all_cuda-120_arch-x86_64.yaml +++ b/conda/environments/all_cuda-120_arch-x86_64.yaml @@ -45,6 +45,7 @@ dependencies: - libkvikio==23.10.* - librdkafka>=1.9.0,<1.10.0a0 - librmm==23.10.* +- make - mimesis>=4.1.0 - moto>=4.0.8 - msgpack-python diff --git a/dependencies.yaml b/dependencies.yaml index 3e5e703c239..0da5dbbb5ad 100644 --- a/dependencies.yaml +++ b/dependencies.yaml @@ -374,12 +374,15 @@ dependencies: common: - output_types: [conda] packages: + - dask-cuda==23.10.* - doxygen=1.8.20 + - make - myst-nb - nbsphinx - numpydoc - pandoc - pydata-sphinx-theme + - scipy - sphinx - sphinx-autobuild - sphinx-copybutton diff --git a/docs/cudf/Makefile b/docs/cudf/Makefile index e513c89a561..0b3a6876a5a 100644 --- a/docs/cudf/Makefile +++ b/docs/cudf/Makefile @@ -2,7 +2,7 @@ # # You can set these variables from the command line. 
-SPHINXOPTS = -n -v +SPHINXOPTS = -n -v -W --keep-going SPHINXBUILD = sphinx-build SPHINXPROJ = cudf SOURCEDIR = source diff --git a/docs/cudf/source/api_docs/dataframe.rst b/docs/cudf/source/api_docs/dataframe.rst index f0d4d40ae65..642996f9379 100644 --- a/docs/cudf/source/api_docs/dataframe.rst +++ b/docs/cudf/source/api_docs/dataframe.rst @@ -149,9 +149,9 @@ Computations / descriptive stats DataFrame.quantile DataFrame.rank DataFrame.round + DataFrame.scale DataFrame.skew DataFrame.sum - DataFrame.sum_of_squares DataFrame.std DataFrame.var DataFrame.nunique @@ -219,6 +219,7 @@ Reshaping, sorting, transposing DataFrame.sort_index DataFrame.nlargest DataFrame.nsmallest + DataFrame.swaplevel DataFrame.stack DataFrame.unstack DataFrame.melt @@ -251,11 +252,17 @@ Serialization / IO / conversion .. autosummary:: :toctree: api/ + DataFrame.deserialize + DataFrame.device_deserialize + DataFrame.device_serialize DataFrame.from_arrow DataFrame.from_dict DataFrame.from_pandas DataFrame.from_records DataFrame.hash_values + DataFrame.host_deserialize + DataFrame.host_serialize + DataFrame.serialize DataFrame.to_arrow DataFrame.to_dict DataFrame.to_dlpack @@ -270,3 +277,5 @@ Serialization / IO / conversion DataFrame.to_feather DataFrame.to_records DataFrame.to_string + DataFrame.values + DataFrame.values_host diff --git a/docs/cudf/source/api_docs/extension_dtypes.rst b/docs/cudf/source/api_docs/extension_dtypes.rst new file mode 100644 index 00000000000..b470df4aa00 --- /dev/null +++ b/docs/cudf/source/api_docs/extension_dtypes.rst @@ -0,0 +1,176 @@ +================ +Extension Dtypes +================ +.. currentmodule:: cudf.core.dtypes + +cuDF supports a number of extension dtypes that build on top of the types that pandas supports. These dtypes are not directly available in pandas, which instead relies on object dtype arrays that run at Python rather than native speeds. The following dtypes are supported: + + +cudf.CategoricalDtype +===================== +.. autosummary:: + :toctree: api/ + :template: autosummary/class_without_autosummary.rst + + CategoricalDtype + + +Properties and Methods +---------------------- +.. autosummary:: + :toctree: api/ + + CategoricalDtype.categories + CategoricalDtype.construct_from_string + CategoricalDtype.deserialize + CategoricalDtype.device_deserialize + CategoricalDtype.device_serialize + CategoricalDtype.from_pandas + CategoricalDtype.host_deserialize + CategoricalDtype.host_serialize + CategoricalDtype.is_dtype + CategoricalDtype.name + CategoricalDtype.ordered + CategoricalDtype.serialize + CategoricalDtype.str + CategoricalDtype.to_pandas + CategoricalDtype.type + + +cudf.Decimal32Dtype +=================== +.. autosummary:: + :toctree: api/ + :template: autosummary/class_without_autosummary.rst + + Decimal32Dtype + +Properties and Methods +---------------------- +.. autosummary:: + :toctree: api/ + + Decimal32Dtype.ITEMSIZE + Decimal32Dtype.MAX_PRECISION + Decimal32Dtype.deserialize + Decimal32Dtype.device_deserialize + Decimal32Dtype.device_serialize + Decimal32Dtype.from_arrow + Decimal32Dtype.host_deserialize + Decimal32Dtype.host_serialize + Decimal32Dtype.is_dtype + Decimal32Dtype.itemsize + Decimal32Dtype.precision + Decimal32Dtype.scale + Decimal32Dtype.serialize + Decimal32Dtype.str + Decimal32Dtype.to_arrow + +cudf.Decimal64Dtype +=================== +.. autosummary:: + :toctree: api/ + :template: autosummary/class_without_autosummary.rst + + Decimal64Dtype + +Properties and Methods +---------------------- +.. 
autosummary:: + :toctree: api/ + + Decimal64Dtype.ITEMSIZE + Decimal64Dtype.MAX_PRECISION + Decimal64Dtype.deserialize + Decimal64Dtype.device_deserialize + Decimal64Dtype.device_serialize + Decimal64Dtype.from_arrow + Decimal64Dtype.host_deserialize + Decimal64Dtype.host_serialize + Decimal64Dtype.is_dtype + Decimal64Dtype.itemsize + Decimal64Dtype.precision + Decimal64Dtype.scale + Decimal64Dtype.serialize + Decimal64Dtype.str + Decimal64Dtype.to_arrow + +cudf.Decimal128Dtype +==================== +.. autosummary:: + :toctree: api/ + :template: autosummary/class_without_autosummary.rst + + Decimal128Dtype + +Properties and Methods +---------------------- +.. autosummary:: + :toctree: api/ + + Decimal128Dtype.ITEMSIZE + Decimal128Dtype.MAX_PRECISION + Decimal128Dtype.deserialize + Decimal128Dtype.device_deserialize + Decimal128Dtype.device_serialize + Decimal128Dtype.from_arrow + Decimal128Dtype.host_deserialize + Decimal128Dtype.host_serialize + Decimal128Dtype.is_dtype + Decimal128Dtype.itemsize + Decimal128Dtype.precision + Decimal128Dtype.scale + Decimal128Dtype.serialize + Decimal128Dtype.str + Decimal128Dtype.to_arrow + +cudf.ListDtype +============== +.. autosummary:: + :toctree: api/ + :template: autosummary/class_without_autosummary.rst + + ListDtype + +Properties and Methods +---------------------- +.. autosummary:: + :toctree: api/ + + ListDtype.deserialize + ListDtype.device_deserialize + ListDtype.device_serialize + ListDtype.element_type + ListDtype.from_arrow + ListDtype.host_deserialize + ListDtype.host_serialize + ListDtype.is_dtype + ListDtype.leaf_type + ListDtype.serialize + ListDtype.to_arrow + ListDtype.type + +cudf.StructDtype +================ +.. autosummary:: + :toctree: api/ + :template: autosummary/class_without_autosummary.rst + + StructDtype + +Properties and Methods +---------------------- +.. autosummary:: + :toctree: api/ + + StructDtype.deserialize + StructDtype.device_deserialize + StructDtype.device_serialize + StructDtype.fields + StructDtype.from_arrow + StructDtype.host_deserialize + StructDtype.host_serialize + StructDtype.is_dtype + StructDtype.serialize + StructDtype.to_arrow + StructDtype.type diff --git a/docs/cudf/source/api_docs/index.rst b/docs/cudf/source/api_docs/index.rst index ef04167c327..01047a31462 100644 --- a/docs/cudf/source/api_docs/index.rst +++ b/docs/cudf/source/api_docs/index.rst @@ -22,3 +22,4 @@ This page provides a list of all publicly accessible modules, methods and classe list_handling struct_handling options + extension_dtypes diff --git a/docs/cudf/source/api_docs/index_objects.rst b/docs/cudf/source/api_docs/index_objects.rst index 9163440b23c..a6a23d189e9 100644 --- a/docs/cudf/source/api_docs/index_objects.rst +++ b/docs/cudf/source/api_docs/index_objects.rst @@ -21,9 +21,10 @@ Properties .. 
autosummary:: :toctree: api/ + Index.dtype + Index.duplicated Index.empty Index.has_duplicates - Index.duplicated Index.hasnans Index.is_monotonic Index.is_monotonic_increasing @@ -52,7 +53,6 @@ Modifying and computations Index.is_floating Index.is_integer Index.is_interval - Index.is_mixed Index.is_numeric Index.is_object Index.min @@ -93,6 +93,13 @@ Conversion :toctree: api/ Index.astype + Index.deserialize + Index.device_deserialize + Index.device_serialize + Index.host_deserialize + Index.host_serialize + Index.serialize + Index.tolist Index.to_arrow Index.to_cupy Index.to_list @@ -110,6 +117,7 @@ Sorting :toctree: api/ Index.argsort + Index.find_label_range Index.searchsorted Index.sort_values @@ -141,6 +149,13 @@ Selecting Index.get_slice_bound Index.isin +String Operations +~~~~~~~~~~~~~~~~~ +.. autosummary:: + :toctree: api/ + + Index.str + .. _api.numericindex: Numeric Index @@ -185,6 +200,7 @@ IntervalIndex ------------- .. autosummary:: :toctree: api/ + :template: autosummary/class_without_autosummary.rst IntervalIndex diff --git a/docs/cudf/source/api_docs/io.rst b/docs/cudf/source/api_docs/io.rst index a52667cd3e4..05c0cc82e62 100644 --- a/docs/cudf/source/api_docs/io.rst +++ b/docs/cudf/source/api_docs/io.rst @@ -39,6 +39,8 @@ Parquet :template: autosummary/class_with_autosummary.rst cudf.io.parquet.ParquetDatasetWriter + cudf.io.parquet.ParquetDatasetWriter.close + cudf.io.parquet.ParquetDatasetWriter.write_table ORC diff --git a/docs/cudf/source/api_docs/list_handling.rst b/docs/cudf/source/api_docs/list_handling.rst index f1fb6d1ca74..78980e5d56c 100644 --- a/docs/cudf/source/api_docs/list_handling.rst +++ b/docs/cudf/source/api_docs/list_handling.rst @@ -5,6 +5,12 @@ List handling lists and apply list methods to it. These can be accessed like ``Series.list.``. +.. currentmodule:: cudf +.. autosummary:: + :toctree: api/ + + Series.list + .. currentmodule:: cudf.core.column.lists.ListMethods .. autosummary:: :toctree: api/ diff --git a/docs/cudf/source/api_docs/series.rst b/docs/cudf/source/api_docs/series.rst index 2193f0435a9..8bab649f079 100644 --- a/docs/cudf/source/api_docs/series.rst +++ b/docs/cudf/source/api_docs/series.rst @@ -46,7 +46,14 @@ Conversion Series.astype Series.convert_dtypes Series.copy + Series.deserialize + Series.device_deserialize + Series.device_serialize + Series.host_deserialize + Series.host_serialize + Series.serialize Series.to_list + Series.tolist Series.__array__ Series.scale @@ -193,6 +200,7 @@ Missing data handling Series.dropna Series.ffill Series.fillna + Series.interpolate Series.isna Series.isnull Series.nans_to_nulls @@ -256,6 +264,12 @@ Datetimelike properties datetimelike and return several properties. These can be accessed like ``Series.dt.``. +.. currentmodule:: cudf +.. autosummary:: + :toctree: api/ + + Series.dt + Datetime properties ^^^^^^^^^^^^^^^^^^^ .. currentmodule:: cudf.core.series.DatetimeProperties @@ -324,6 +338,12 @@ Categorical accessor Categorical-dtype specific methods and attributes are available under the ``Series.cat`` accessor. +.. currentmodule:: cudf +.. autosummary:: + :toctree: api/ + + Series.cat + .. currentmodule:: cudf.core.column.categorical.CategoricalAccessor .. 
autosummary:: :toctree: api/ diff --git a/docs/cudf/source/api_docs/string_handling.rst b/docs/cudf/source/api_docs/string_handling.rst index cc85e9dba2b..ab0f085e1a6 100644 --- a/docs/cudf/source/api_docs/string_handling.rst +++ b/docs/cudf/source/api_docs/string_handling.rst @@ -5,6 +5,12 @@ String handling strings and apply several methods to it. These can be accessed like ``Series.str.``. +.. currentmodule:: cudf +.. autosummary:: + :toctree: api/ + + Series.str + .. currentmodule:: cudf.core.column.string.StringMethods .. autosummary:: :toctree: api/ diff --git a/docs/cudf/source/api_docs/struct_handling.rst b/docs/cudf/source/api_docs/struct_handling.rst index 05ba990382a..336aa732df8 100644 --- a/docs/cudf/source/api_docs/struct_handling.rst +++ b/docs/cudf/source/api_docs/struct_handling.rst @@ -5,6 +5,12 @@ Struct handling Structs and apply struct methods to it. These can be accessed like ``Series.struct.``. +.. currentmodule:: cudf +.. autosummary:: + :toctree: api/ + + Series.struct + .. currentmodule:: cudf.core.column.struct.StructMethods .. autosummary:: :toctree: api/ diff --git a/docs/cudf/source/api_docs/subword_tokenize.rst b/docs/cudf/source/api_docs/subword_tokenize.rst index fc814bcb92a..80d77ebcde2 100644 --- a/docs/cudf/source/api_docs/subword_tokenize.rst +++ b/docs/cudf/source/api_docs/subword_tokenize.rst @@ -10,3 +10,4 @@ Constructor :template: autosummary/class_with_autosummary.rst SubwordTokenizer + SubwordTokenizer.__call__ diff --git a/docs/cudf/source/conf.py b/docs/cudf/source/conf.py index 1654750bdf8..f9982c69e1b 100644 --- a/docs/cudf/source/conf.py +++ b/docs/cudf/source/conf.py @@ -204,10 +204,19 @@ "numpy": ("https://numpy.org/doc/stable", None), "pyarrow": ("https://arrow.apache.org/docs/", None), "pandas": ("https://pandas.pydata.org/docs/", None), + "typing_extensions": ("https://typing-extensions.readthedocs.io/en/stable/", None), } # Config numpydoc -numpydoc_show_inherited_class_members = True +numpydoc_show_inherited_class_members = { + "cudf.core.dtypes.CategoricalDtype": False, + "cudf.core.dtypes.Decimal32Dtype": False, + "cudf.core.dtypes.Decimal64Dtype": False, + "cudf.core.dtypes.Decimal128Dtype": False, + "cudf.core.dtypes.ListDtype": False, + "cudf.core.dtypes.StructDtype": False, + "cudf.option_context": False, +} numpydoc_class_members_toctree = False numpydoc_attributes_as_param_list = False @@ -267,7 +276,13 @@ def process_class_docstrings(app, what, name, obj, options, lines): lines[:] = lines[:cut_index] -nitpick_ignore = [("py:class", "SeriesOrIndex"),] +nitpick_ignore = [ + ("py:class", "SeriesOrIndex"), + ("py:class", "Dtype"), + # TODO: Remove this when we figure out why typing_extensions doesn't seem + # to map types correctly for intersphinx + ("py:class", "typing_extensions.Self"), +] def setup(app): app.add_css_file("https://docs.rapids.ai/assets/css/custom.css") diff --git a/docs/cudf/source/user_guide/10min.ipynb b/docs/cudf/source/user_guide/10min.ipynb index 0352c624e04..c3da2558db8 100644 --- a/docs/cudf/source/user_guide/10min.ipynb +++ b/docs/cudf/source/user_guide/10min.ipynb @@ -5,12 +5,11 @@ "id": "4c6c548b", "metadata": {}, "source": [ - "10 Minutes to cuDF and Dask-cuDF\n", - "=======================\n", + "# 10 Minutes to cuDF and Dask-cuDF\n", "\n", "Modelled after 10 Minutes to Pandas, this is a short introduction to cuDF and Dask-cuDF, geared mainly towards new users.\n", "\n", - "### What are these Libraries?\n", + "## What are these Libraries?\n", "\n", "[cuDF](https://github.com/rapidsai/cudf) is a 
Python GPU DataFrame library (built on the Apache Arrow columnar memory format) for loading, joining, aggregating, filtering, and otherwise manipulating tabular data using a DataFrame style API in the style of [pandas](https://pandas.pydata.org).\n", "\n", @@ -19,7 +18,7 @@ "[Dask-cuDF](https://github.com/rapidsai/cudf/tree/main/python/dask_cudf) extends Dask where necessary to allow its DataFrame partitions to be processed using cuDF GPU DataFrames instead of Pandas DataFrames. For instance, when you call `dask_cudf.read_csv(...)`, your cluster's GPUs do the work of parsing the CSV file(s) by calling [`cudf.read_csv()`](https://docs.rapids.ai/api/cudf/stable/api_docs/api/cudf.read_csv.html).\n", "\n", "\n", - "### When to use cuDF and Dask-cuDF\n", + "## When to use cuDF and Dask-cuDF\n", "\n", "If your workflow is fast enough on a single GPU or your data comfortably fits in memory on a single GPU, you would want to use cuDF. If you want to distribute your workflow across multiple GPUs, have more data than you can fit in memory on a single GPU, or want to analyze data spread across many files at once, you would want to use Dask-cuDF." ] @@ -51,8 +50,7 @@ "id": "eff5fc19", "metadata": {}, "source": [ - "Object Creation\n", - "---------------" + "## Object Creation" ] }, { @@ -574,8 +572,7 @@ "id": "5820795f", "metadata": {}, "source": [ - "Viewing Data\n", - "-------------" + "## Viewing Data" ] }, { @@ -1002,10 +999,7 @@ "id": "3302a647", "metadata": {}, "source": [ - "Selection\n", - "------------\n", - "\n", - "## Getting" + "## Selecting a Column" ] }, { @@ -1088,7 +1082,7 @@ "id": "a5160dd1", "metadata": {}, "source": [ - "## Selection by Label" + "## Selecting Rows by Label" ] }, { @@ -1250,7 +1244,7 @@ "id": "d8e07162", "metadata": {}, "source": [ - "## Selection by Position" + "## Selecting Rows by Position" ] }, { @@ -2316,15 +2310,6 @@ "ds.fillna(999).head(n=3)" ] }, - { - "cell_type": "markdown", - "id": "22199029", - "metadata": {}, - "source": [ - "Operations\n", - "------------" - ] - }, { "cell_type": "markdown", "id": "d97605e6", @@ -3643,8 +3628,7 @@ "id": "e0915c46", "metadata": {}, "source": [ - "Time Series\n", - "------------" + "## Time Series" ] }, { @@ -3816,8 +3800,7 @@ "id": "45f9408b", "metadata": {}, "source": [ - "Categoricals\n", - "------------" + "## Categoricals" ] }, { @@ -4076,21 +4059,12 @@ "dgdf.grade.cat.codes.compute()" ] }, - { - "cell_type": "markdown", - "id": "f9d616e2", - "metadata": {}, - "source": [ - "Converting Data Representation\n", - "--------------------------------" - ] - }, { "cell_type": "markdown", "id": "1b391a0d", "metadata": {}, "source": [ - "## Pandas" + "## Converting to Pandas" ] }, { @@ -4413,7 +4387,7 @@ "id": "a104294a", "metadata": {}, "source": [ - "## Numpy" + "## Converting to Numpy" ] }, { @@ -4561,7 +4535,7 @@ "id": "b520acf7", "metadata": {}, "source": [ - "## Arrow" + "## Converting to Arrow" ] }, { @@ -4636,21 +4610,12 @@ "ddf.head().to_arrow()" ] }, - { - "cell_type": "markdown", - "id": "f0734690", - "metadata": {}, - "source": [ - "Getting Data In/Out\n", - "------------------------" - ] - }, { "cell_type": "markdown", "id": "6f0251c6", "metadata": {}, "source": [ - "## CSV" + "## Reading/Writing CSV Files" ] }, { @@ -5142,7 +5107,7 @@ "id": "763c555b", "metadata": {}, "source": [ - "## Parquet" + "## Reading/Writing Parquet Files" ] }, { @@ -5427,7 +5392,7 @@ "id": "90a49967", "metadata": {}, "source": [ - "## ORC" + "## Reading/Writing ORC Files" ] }, { @@ -5694,8 +5659,7 @@ "id": "c988553d", "metadata": {}, 
"source": [ - "Dask Performance Tips\n", - "--------------------------------\n", + "## Dask Performance Tips\n", "\n", "Like Apache Spark, Dask operations are [lazy](https://en.wikipedia.org/wiki/Lazy_evaluation). Instead of being executed immediately, most operations are added to a task graph and the actual evaluation is delayed until the result is needed.\n", "\n", @@ -6171,7 +6135,7 @@ ], "metadata": { "kernelspec": { - "display_name": "Python 3.9.13 ('cudf-dev')", + "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, @@ -6185,7 +6149,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.9.13" + "version": "3.10.12" }, "vscode": { "interpreter": { diff --git a/docs/cudf/source/user_guide/cudf.CategoricalDtype.rst b/docs/cudf/source/user_guide/cudf.CategoricalDtype.rst deleted file mode 100644 index 808c20e0750..00000000000 --- a/docs/cudf/source/user_guide/cudf.CategoricalDtype.rst +++ /dev/null @@ -1,19 +0,0 @@ -cudf.CategoricalDtype -===================== - -.. currentmodule:: cudf - -.. autoclass:: CategoricalDtype - :members: categories, ordered, from_pandas, to_pandas - - - -.. - HACK -- the point here is that we don't want this to appear in the output, but the autosummary should still generate the pages. - .. autosummary:: - :toctree: - - CategoricalDtype.categories - CategoricalDtype.ordered - CategoricalDtype.from_pandas - CategoricalDtype.to_pandas diff --git a/docs/cudf/source/user_guide/cudf.Decimal128Dtype.rst b/docs/cudf/source/user_guide/cudf.Decimal128Dtype.rst deleted file mode 100644 index cada8fd6cb6..00000000000 --- a/docs/cudf/source/user_guide/cudf.Decimal128Dtype.rst +++ /dev/null @@ -1,20 +0,0 @@ -cudf.Decimal128Dtype -=================== - -.. currentmodule:: cudf - -.. autoclass:: Decimal128Dtype - :members: precision, scale, itemsize, to_arrow, from_arrow - - - -.. - HACK -- the point here is that we don't want this to appear in the output, but the autosummary should still generate the pages. - .. autosummary:: - :toctree: - - Decimal128Dtype.precision - Decimal128Dtype.scale - Decimal128Dtype.itemsize - Decimal128Dtype.to_arrow - Decimal128Dtype.from_arrow diff --git a/docs/cudf/source/user_guide/cudf.Decimal32Dtype.rst b/docs/cudf/source/user_guide/cudf.Decimal32Dtype.rst deleted file mode 100644 index c4c65bb2d24..00000000000 --- a/docs/cudf/source/user_guide/cudf.Decimal32Dtype.rst +++ /dev/null @@ -1,20 +0,0 @@ -cudf.Decimal32Dtype -=================== - -.. currentmodule:: cudf - -.. autoclass:: Decimal32Dtype - :members: precision, scale, itemsize, to_arrow, from_arrow - - - -.. - HACK -- the point here is that we don't want this to appear in the output, but the autosummary should still generate the pages. - .. autosummary:: - :toctree: - - Decimal32Dtype.precision - Decimal32Dtype.scale - Decimal32Dtype.itemsize - Decimal32Dtype.to_arrow - Decimal32Dtype.from_arrow diff --git a/docs/cudf/source/user_guide/cudf.Decimal64Dtype.rst b/docs/cudf/source/user_guide/cudf.Decimal64Dtype.rst deleted file mode 100644 index 99305ade485..00000000000 --- a/docs/cudf/source/user_guide/cudf.Decimal64Dtype.rst +++ /dev/null @@ -1,20 +0,0 @@ -cudf.Decimal64Dtype -=================== - -.. currentmodule:: cudf - -.. autoclass:: Decimal64Dtype - :members: precision, scale, itemsize, to_arrow, from_arrow - - - -.. - HACK -- the point here is that we don't want this to appear in the output, but the autosummary should still generate the pages. - .. 
autosummary:: - :toctree: - - Decimal64Dtype.precision - Decimal64Dtype.scale - Decimal64Dtype.itemsize - Decimal64Dtype.to_arrow - Decimal64Dtype.from_arrow diff --git a/docs/cudf/source/user_guide/cudf.ListDtype.rst b/docs/cudf/source/user_guide/cudf.ListDtype.rst deleted file mode 100644 index a9b5000e657..00000000000 --- a/docs/cudf/source/user_guide/cudf.ListDtype.rst +++ /dev/null @@ -1,19 +0,0 @@ -cudf.ListDtype -============== - -.. currentmodule:: cudf - -.. autoclass:: ListDtype - :members: element_type, leaf_type, from_arrow, to_arrow - - - -.. - HACK -- the point here is that we don't want this to appear in the output, but the autosummary should still generate the pages. - .. autosummary:: - :toctree: - - ListDtype.element_type - ListDtype.leaf_type - ListDtype.from_arrow - ListDtype.to_arrow diff --git a/docs/cudf/source/user_guide/cudf.StructDtype.rst b/docs/cudf/source/user_guide/cudf.StructDtype.rst deleted file mode 100644 index dd2a841dbe3..00000000000 --- a/docs/cudf/source/user_guide/cudf.StructDtype.rst +++ /dev/null @@ -1,18 +0,0 @@ -cudf.StructDtype -================ - -.. currentmodule:: cudf - -.. autoclass:: StructDtype - :members: fields, from_arrow, to_arrow - - - -.. - HACK -- the point here is that we don't want this to appear in the output, but the autosummary should still generate the pages. - .. autosummary:: - :toctree: - - StructDtype.fields - StructDtype.from_arrow - StructDtype.to_arrow diff --git a/docs/cudf/source/user_guide/cupy-interop.ipynb b/docs/cudf/source/user_guide/cupy-interop.ipynb index c98a4ddea23..c5b1210a2c7 100644 --- a/docs/cudf/source/user_guide/cupy-interop.ipynb +++ b/docs/cudf/source/user_guide/cupy-interop.ipynb @@ -35,7 +35,7 @@ "id": "e7e64b1a", "metadata": {}, "source": [ - "### Converting a cuDF DataFrame to a CuPy Array\n", + "## Converting a cuDF DataFrame to a CuPy Array\n", "\n", "If we want to convert a cuDF DataFrame to a CuPy ndarray, There are multiple ways to do it:\n", "\n", @@ -110,7 +110,7 @@ "id": "0759ab29", "metadata": {}, "source": [ - "### Converting a cuDF Series to a CuPy Array" + "## Converting a cuDF Series to a CuPy Array" ] }, { @@ -258,7 +258,7 @@ "id": "b353bded", "metadata": {}, "source": [ - "### Converting a CuPy Array to a cuDF DataFrame\n", + "## Converting a CuPy Array to a cuDF DataFrame\n", "\n", "We can also convert a CuPy ndarray to a cuDF DataFrame. Like before, there are multiple ways to do it:\n", "\n", @@ -782,7 +782,7 @@ "id": "395e2bba", "metadata": {}, "source": [ - "### Converting a CuPy Array to a cuDF Series\n", + "## Converting a CuPy Array to a cuDF Series\n", "\n", "To convert an array to a Series, we can directly pass the array to the `Series` constructor." ] @@ -818,7 +818,7 @@ "id": "7e159619", "metadata": {}, "source": [ - "### Interweaving CuDF and CuPy for Smooth PyData Workflows\n", + "## Interweaving CuDF and CuPy for Smooth PyData Workflows\n", "\n", "RAPIDS libraries and the entire GPU PyData ecosystem are developing quickly, but sometimes a one library may not have the functionality you need. One example of this might be taking the row-wise sum (or mean) of a Pandas DataFrame. cuDF's support for row-wise operations isn't mature, so you'd need to either transpose the DataFrame or write a UDF and explicitly calculate the sum across each row. 
Transposing could lead to hundreds of thousands of columns (which cuDF wouldn't perform well with) depending on your data's shape, and writing a UDF can be time intensive.\n", "\n", @@ -1078,7 +1078,7 @@ "id": "61bfb868", "metadata": {}, "source": [ - "### Converting a cuDF DataFrame to a CuPy Sparse Matrix\n", + "## Converting a cuDF DataFrame to a CuPy Sparse Matrix\n", "\n", "We can also convert a DataFrame or Series to a CuPy sparse matrix. We might want to do this if downstream processes expect CuPy sparse matrices as an input.\n", "\n", @@ -1421,7 +1421,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.8.13" + "version": "3.10.12" } }, "nbformat": 4, diff --git a/docs/cudf/source/user_guide/data-types.md b/docs/cudf/source/user_guide/data-types.md index ee75457e87d..1f4cfbc7366 100644 --- a/docs/cudf/source/user_guide/data-types.md +++ b/docs/cudf/source/user_guide/data-types.md @@ -9,18 +9,18 @@ All data types in cuDF are [nullable](missing-data).
-| Kind of data | Data type(s) | -|----------------------|---------------------------------------------------------------------------------------------------| -| Signed integer | `'int8'`, `'int16'`, `'int32'`, `'int64'` | -| Unsigned integer | `'uint32'`, `'uint64'` | -| Floating-point | `'float32'`, `'float64'` | -| Datetime | `'datetime64[s]'`, `'datetime64[ms]'`, `'datetime64['us']`, `'datetime64[ns]'` | -| Timedelta (duration) | `'timedelta[s]'`, `'timedelta[ms]'`, `'timedelta['us']`, `'timedelta[ns]'` | -| Category | {py:func}`cudf.CategoricalDtype` | -| String | `'object'` or `'string'` | -| Decimal | {py:func}`cudf.Decimal32Dtype`, {py:func}`cudf.Decimal64Dtype`, {py:func}`cudf.Decimal128Dtype` | -| List | {py:func}`cudf.ListDtype` | -| Struct | {py:func}`cudf.StructDtype` | +| Kind of data | Data type(s) | +|----------------------|------------------------------------------------------------------------------------------------------------------------------------------| +| Signed integer | `'int8'`, `'int16'`, `'int32'`, `'int64'` | +| Unsigned integer | `'uint32'`, `'uint64'` | +| Floating-point | `'float32'`, `'float64'` | +| Datetime | `'datetime64[s]'`, `'datetime64[ms]'`, `'datetime64['us']`, `'datetime64[ns]'` | +| Timedelta (duration) | `'timedelta[s]'`, `'timedelta[ms]'`, `'timedelta['us']`, `'timedelta[ns]'` | +| Category | {py:class}`~cudf.core.dtypes.CategoricalDtype` | +| String | `'object'` or `'string'` | +| Decimal | {py:class}`~cudf.core.dtypes.Decimal32Dtype`, {py:class}`~cudf.core.dtypes.Decimal64Dtype`, {py:class}`~cudf.core.dtypes.Decimal128Dtype`| +| List | {py:class}`~cudf.core.dtypes.ListDtype` | +| Struct | {py:class}`~cudf.core.dtypes.StructDtype` |
@@ -60,9 +60,11 @@ cuDF does not support storing arbitrary Python objects. ## Decimal data types We provide special data types for working with decimal data, namely -`Decimal32Dtype`, `Decimal64Dtype`, and `Decimal128Dtype`. Use these -data types when you need to store values with greater precision than -allowed by floating-point representation. +{py:class}`~cudf.core.dtypes.Decimal32Dtype`, +{py:class}`~cudf.core.dtypes.Decimal64Dtype`, and +{py:class}`~cudf.core.dtypes.Decimal128Dtype`. Use these data types when you +need to store values with greater precision than allowed by floating-point +representation. Decimal data types in cuDF are based on fixed-point representation. A decimal data type is composed of a _precision_ and a _scale_. The @@ -110,10 +112,11 @@ type: ## Nested data types (`List` and `Struct`) -`ListDtype` and `StructDtype` are special data types in cuDF for -working with list-like and dictionary-like data. These are referred to -as "nested" data types, because they enable you to store a list of -lists, or a struct of lists, or a struct of list of lists, etc., +{py:class}`~cudf.core.dtypes.ListDtype` and +{py:class}`~cudf.core.dtypes.StructDtype` are special data types in cuDF for +working with list-like and dictionary-like data. These are referred to as +"nested" data types, because they enable you to store a list of lists, or a +struct of lists, or a struct of list of lists, etc., You can create lists and struct Series from existing Pandas Series of lists and dictionaries respectively: diff --git a/docs/cudf/source/user_guide/groupby.md b/docs/cudf/source/user_guide/groupby.md index 66b548727e1..53ff971a64f 100644 --- a/docs/cudf/source/user_guide/groupby.md +++ b/docs/cudf/source/user_guide/groupby.md @@ -1,6 +1,7 @@ --- -substitutions: - describe: '`describe`' +myst: + substitutions: + describe: '`describe`' --- (basics-groupby)= diff --git a/docs/cudf/source/user_guide/guide-to-udfs.ipynb b/docs/cudf/source/user_guide/guide-to-udfs.ipynb index 925ec4e6559..1e6d4b332d1 100644 --- a/docs/cudf/source/user_guide/guide-to-udfs.ipynb +++ b/docs/cudf/source/user_guide/guide-to-udfs.ipynb @@ -2186,7 +2186,7 @@ "source": [ "## GroupBy DataFrame UDFs\n", "\n", - "We can also apply UDFs to grouped DataFrames using `apply_grouped`. This example is also drawn and adapted from the RAPIDS [API documentation]().\n", + "We can also apply UDFs to grouped DataFrames using `apply_grouped`.\n", "\n", "First, we'll group our DataFrame based on column `b`, which is either True or False." ] diff --git a/docs/cudf/source/user_guide/pandas-comparison.md b/docs/cudf/source/user_guide/pandas-comparison.md index fd3ea3f67c9..9e821fd8833 100644 --- a/docs/cudf/source/user_guide/pandas-comparison.md +++ b/docs/cudf/source/user_guide/pandas-comparison.md @@ -145,7 +145,7 @@ For example, `s.sum()` is not guaranteed to produce identical results to Pandas nor produce identical results from run to run, when `s` is a Series of floats. If you need to compare floating point results, you should typically do so using the functions provided in the -[`cudf.testing`](/api_docs/general_utilities#testing-functions) +[`cudf.testing`](/api_docs/general_utilities) module, which allow you to compare values up to a desired precision. 
## Column names diff --git a/docs/dask_cudf/Makefile b/docs/dask_cudf/Makefile index d0c3cbf1020..37721116bd9 100644 --- a/docs/dask_cudf/Makefile +++ b/docs/dask_cudf/Makefile @@ -3,8 +3,9 @@ # You can set these variables from the command line, and also # from the environment for the first two. -SPHINXOPTS ?= +SPHINXOPTS ?= -n -v SPHINXBUILD ?= sphinx-build +SPHINXPROJ = dask_cudf SOURCEDIR = source BUILDDIR = build diff --git a/python/cudf/cudf/core/_base_index.py b/python/cudf/cudf/core/_base_index.py index de2eae2c23e..981a8323138 100644 --- a/python/cudf/cudf/core/_base_index.py +++ b/python/cudf/cudf/core/_base_index.py @@ -2,6 +2,7 @@ from __future__ import annotations +import builtins import pickle import warnings from functools import cached_property @@ -18,7 +19,6 @@ drop_nulls, ) from cudf._lib.types import size_type_dtype -from cudf._typing import DtypeObj from cudf.api.types import ( is_bool_dtype, is_integer, @@ -32,42 +32,10 @@ from cudf.utils import ioutils from cudf.utils.dtypes import is_mixed_with_object_dtype -_index_astype_docstring = """\ -Create an Index with values cast to dtypes. - -The class of a new Index is determined by dtype. When conversion is -impossible, a ValueError exception is raised. - -Parameters ----------- -dtype : :class:`numpy.dtype` - Use a :class:`numpy.dtype` to cast entire Index object to. -copy : bool, default False - By default, astype always returns a newly allocated object. - If copy is set to False and internal requirements on dtype are - satisfied, the original data is used to create a new Index - or the original Index is returned. - -Returns -------- -Index - Index with values cast to specified dtype. - -Examples --------- ->>> import cudf ->>> index = cudf.Index([1, 2, 3]) ->>> index -Int64Index([1, 2, 3], dtype='int64') ->>> index.astype('float64') -Float64Index([1.0, 2.0, 3.0], dtype='float64') -""" - class BaseIndex(Serializable): """Base class for all cudf Index types.""" - dtype: DtypeObj _accessors: Set[Any] = set() _data: ColumnAccessor @@ -79,7 +47,7 @@ def _columns(self) -> Tuple[Any, ...]: def _values(self) -> ColumnBase: raise NotImplementedError - def copy(self, deep: bool = True) -> BaseIndex: + def copy(self, deep: bool = True) -> Self: raise NotImplementedError def __len__(self): @@ -90,10 +58,130 @@ def size(self): # The size of an index is always its length irrespective of dimension. return len(self) + def astype(self, dtype, copy: bool = True): + """Create an Index with values cast to dtypes. + + The class of a new Index is determined by dtype. When conversion is + impossible, a ValueError exception is raised. + + Parameters + ---------- + dtype : :class:`numpy.dtype` + Use a :class:`numpy.dtype` to cast entire Index object to. + copy : bool, default False + By default, astype always returns a newly allocated object. + If copy is set to False and internal requirements on dtype are + satisfied, the original data is used to create a new Index + or the original Index is returned. + + Returns + ------- + Index + Index with values cast to specified dtype. + + Examples + -------- + >>> import cudf + >>> index = cudf.Index([1, 2, 3]) + >>> index + Int64Index([1, 2, 3], dtype='int64') + >>> index.astype('float64') + Float64Index([1.0, 2.0, 3.0], dtype='float64') + """ + raise NotImplementedError + + def argsort(self, *args, **kwargs): + """Return the integer indices that would sort the index. + + Parameters vary by subclass. 
+ """ + raise NotImplementedError + + @property + def dtype(self): + raise NotImplementedError + + @property + def empty(self): + return self.size == 0 + + @property + def is_unique(self): + """Return if the index has unique values.""" + raise NotImplementedError + + def memory_usage(self, deep=False): + """Return the memory usage of an object. + + Parameters + ---------- + deep : bool + The deep parameter is ignored and is only included for pandas + compatibility. + + Returns + ------- + The total bytes used. + """ + raise NotImplementedError + + def tolist(self): # noqa: D102 + raise TypeError( + "cuDF does not support conversion to host memory " + "via the `tolist()` method. Consider using " + "`.to_arrow().to_pylist()` to construct a Python list." + ) + + to_list = tolist + + @property + def name(self): + """Returns the name of the Index.""" + raise NotImplementedError + + @property # type: ignore + def ndim(self): # noqa: D401 + """Number of dimensions of the underlying data, by definition 1.""" + return 1 + + def equals(self, other): + """ + Determine if two Index objects contain the same elements. + + Returns + ------- + out: bool + True if "other" is an Index and it has the same elements + as calling index; False otherwise. + """ + raise NotImplementedError + + def shift(self, periods=1, freq=None): + """Not yet implemented""" + raise NotImplementedError + + @property + def shape(self): + """Get a tuple representing the dimensionality of the data.""" + return (len(self),) + + @property + def str(self): + """Not yet implemented.""" + raise NotImplementedError + @property def values(self): raise NotImplementedError + def max(self): + """The maximum value of the index.""" + raise NotImplementedError + + def min(self): + """The minimum value of the index.""" + raise NotImplementedError + def get_loc(self, key, method=None, tolerance=None): raise NotImplementedError @@ -341,6 +429,30 @@ def set_names(self, names, level=None, inplace=False): def has_duplicates(self): return not self.is_unique + def where(self, cond, other=None, inplace=False): + """ + Replace values where the condition is False. + + The replacement is taken from other. + + Parameters + ---------- + cond : bool array-like with the same length as self + Condition to select the values on. + other : scalar, or array-like, default None + Replacement if the condition is False. + + Returns + ------- + cudf.Index + A copy of self with values replaced from other + where the condition is False. + """ + raise NotImplementedError + + def factorize(self, sort=False, na_sentinel=None, use_na_sentinel=None): + raise NotImplementedError + def union(self, other, sort=None): """ Form the union of two Index objects. @@ -599,6 +711,18 @@ def to_frame(self, index=True, name=None): {col_name: self._values}, index=self if index else None ) + def to_arrow(self): + """Convert to a suitable Arrow object.""" + raise NotImplementedError + + def to_cupy(self): + """Convert to a cupy array.""" + raise NotImplementedError + + def to_numpy(self): + """Convert to a numpy array.""" + raise NotImplementedError + def any(self): """ Return whether any elements is True in Index. @@ -610,7 +734,7 @@ def isna(self): Detect missing values. Return a boolean same-sized object indicating if the values are NA. - NA values, such as ``None``, :attr:`numpy.NaN` or :attr:`cudf.NaN`, get + NA values, such as ``None``, `numpy.NAN` or `cudf.NA`, get mapped to ``True`` values. Everything else get mapped to ``False`` values. 
@@ -628,7 +752,7 @@ def notna(self): Return a boolean same-sized object indicating if the values are not NA. Non-missing values get mapped to ``True``. - NA values, such as None or :attr:`numpy.NaN`, get mapped to ``False`` + NA values, such as None or `numpy.NAN`, get mapped to ``False`` values. Returns @@ -1502,9 +1626,9 @@ def find_label_range(self, loc: slice) -> slice: def searchsorted( self, value, - side: str = "left", + side: builtins.str = "left", ascending: bool = True, - na_position: str = "last", + na_position: builtins.str = "last", ): """Find index where elements should be inserted to maintain order @@ -1529,9 +1653,9 @@ def searchsorted( As a precondition the index must be sorted in the same order as requested by the `ascending` flag. """ - raise NotImplementedError() + raise NotImplementedError - def get_slice_bound(self, label, side: str, kind=None) -> int: + def get_slice_bound(self, label, side: builtins.str, kind=None) -> int: """ Calculate slice bound that corresponds to given label. Returns leftmost (one-past-the-rightmost if ``side=='right'``) position diff --git a/python/cudf/cudf/core/column/string.py b/python/cudf/cudf/core/column/string.py index 0270351347d..1a7679e6336 100644 --- a/python/cudf/cudf/core/column/string.py +++ b/python/cudf/cudf/core/column/string.py @@ -5252,18 +5252,16 @@ def edit_distance_matrix(self) -> SeriesOrIndex: should not contain nulls. Edit distance is measured based on the `Levenshtein edit distance - algorithm - `_. - + algorithm `_. Returns ------- Series of ListDtype(int64) - Assume `N` is the length of this series. The return series contains - `N` lists of size `N`, where the `j`th number in the `i`th row of - the series tells the edit distance between the `i`th string and the - `j`th string of this series. - The matrix is symmetric. Diagonal elements are 0. + Assume ``N`` is the length of this series. The return series + contains ``N`` lists of size ``N``, where the ``j`` th number in + the ``i`` th row of the series tells the edit distance between the + ``i`` th string and the ``j`` th string of this series. The matrix + is symmetric. Diagonal elements are 0. Examples -------- diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index d421258b06b..0298dd103f5 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -1019,6 +1019,12 @@ def deserialize(cls, header, frames): return obj + @property + @_cudf_nvtx_annotate + def shape(self): + """Returns a tuple representing the dimensionality of the DataFrame.""" + return self._num_rows, self._num_columns + @property def dtypes(self): """ @@ -2074,6 +2080,7 @@ def to_dict( - 'records' : list like [{column -> value}, ... , {column -> value}] - 'index' : dict like {index -> {column -> value}} + Abbreviations are allowed. `s` indicates `series` and `sp` indicates `split`. @@ -2478,6 +2485,7 @@ def reindex( We _highly_ recommend using keyword arguments to clarify your intent. Create a dataframe with some fictional data. + >>> index = ['Firefox', 'Chrome', 'Safari', 'IE10', 'Konqueror'] >>> df = cudf.DataFrame({'http_status': [200, 200, 404, 404, 301], ... 'response_time': [0.04, 0.02, 0.07, 0.08, 1.0]}, @@ -2519,6 +2527,7 @@ def reindex( Chrome 200 0.02 We can also reindex the columns. 
+ >>> df.reindex(columns=['http_status', 'user_agent']) http_status user_agent Firefox 200 @@ -2528,6 +2537,7 @@ def reindex( Konqueror 301 Or we can use "axis-style" keyword arguments + >>> df.reindex(columns=['http_status', 'user_agent']) http_status user_agent Firefox 200 @@ -4146,10 +4156,9 @@ def apply( func : function Function to apply to each row. axis : {0 or 'index', 1 or 'columns'}, default 0 - Axis along which the function is applied: - * 0 or 'index': apply function to each column. - Note: axis=0 is not yet supported. - * 1 or 'columns': apply function to each row. + Axis along which the function is applied. + - 0 or 'index': apply function to each column (not yet supported). + - 1 or 'columns': apply function to each row. raw: bool, default False Not yet supported result_type: {'expand', 'reduce', 'broadcast', None}, default None diff --git a/python/cudf/cudf/core/dtypes.py b/python/cudf/cudf/core/dtypes.py index f6be7f8c2b9..4acdc2431f8 100644 --- a/python/cudf/cudf/core/dtypes.py +++ b/python/cudf/cudf/core/dtypes.py @@ -651,17 +651,19 @@ def itemsize(self): Notes ----- - When the scale is positive: - - numbers with fractional parts (e.g., 0.0042) can be represented - - the scale is the total number of digits to the right of the - decimal point - When the scale is negative: - - only multiples of powers of 10 (including 10**0) can be - represented (e.g., 1729, 4200, 1000000) - - the scale represents the number of trailing zeros in the value. - For example, 42 is representable with precision=2 and scale=0. - 13.0051 is representable with precision=6 and scale=4, - and *not* representable with precision<6 or scale<4. + When the scale is positive: + - numbers with fractional parts (e.g., 0.0042) can be represented + - the scale is the total number of digits to the right of the + decimal point + + When the scale is negative: + - only multiples of powers of 10 (including 10**0) can be + represented (e.g., 1729, 4200, 1000000) + - the scale represents the number of trailing zeros in the value. + + For example, 42 is representable with precision=2 and scale=0. + 13.0051 is representable with precision=6 and scale=4, + and *not* representable with precision<6 or scale<4. Examples -------- diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index 85f83953465..466a704c56e 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -19,7 +19,10 @@ Union, ) +# TODO: The `numpy` import is needed for typing purposes during doc builds +# only, need to figure out why the `np` alias is insufficient then remove. import cupy +import numpy import numpy as np import pandas as pd import pyarrow as pa @@ -246,74 +249,6 @@ def size(self): """ return self._num_columns * self._num_rows - @property - @_cudf_nvtx_annotate - def shape(self): - """Returns a tuple representing the dimensionality of the DataFrame.""" - return self._num_rows, self._num_columns - - @property - @_cudf_nvtx_annotate - def empty(self): - """ - Indicator whether DataFrame or Series is empty. - - True if DataFrame/Series is entirely empty (no items), - meaning any of the axes are of length 0. - - Returns - ------- - out : bool - If DataFrame/Series is empty, return True, if not return False. - - Notes - ----- - If DataFrame/Series contains only `null` values, it is still not - considered empty. See the example below. 
- - Examples - -------- - >>> import cudf - >>> df = cudf.DataFrame({'A' : []}) - >>> df - Empty DataFrame - Columns: [A] - Index: [] - >>> df.empty - True - - If we only have `null` values in our DataFrame, it is - not considered empty! We will need to drop - the `null`'s to make the DataFrame empty: - - >>> df = cudf.DataFrame({'A' : [None, None]}) - >>> df - A - 0 - 1 - >>> df.empty - False - >>> df.dropna().empty - True - - Non-empty and empty Series example: - - >>> s = cudf.Series([1, 2, None]) - >>> s - 0 1 - 1 2 - 2 - dtype: int64 - >>> s.empty - False - >>> s = cudf.Series([]) - >>> s - Series([], dtype: float64) - >>> s.empty - True - """ - return self.size == 0 - @_cudf_nvtx_annotate def memory_usage(self, deep=False): """Return the memory usage of an object. @@ -354,15 +289,16 @@ def astype(self, dtype, copy=False, **kwargs): def equals(self, other): """ Test whether two objects contain the same elements. - This function allows two Series or DataFrames to be compared against + + This function allows two objects to be compared against each other to see if they have the same shape and elements. NaNs in the same location are considered equal. The column headers do not need to have the same type. Parameters ---------- - other : Series or DataFrame - The other Series or DataFrame to be compared with the first. + other : Index, Series, DataFrame + The other object to be compared with. Returns ------- @@ -576,7 +512,7 @@ def to_numpy( dtype: Union[Dtype, None] = None, copy: bool = True, na_value=None, - ) -> np.ndarray: + ) -> numpy.ndarray: """Convert the Frame to a NumPy array. Parameters @@ -2642,10 +2578,6 @@ def head(self, n=5): DataFrame or Series The first `n` rows of the caller object. - See Also - -------- - Frame.tail: Returns the last `n` rows. - Examples -------- **Series** diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py index 6de6d770c17..618d2eb4553 100644 --- a/python/cudf/cudf/core/index.py +++ b/python/cudf/cudf/core/index.py @@ -37,7 +37,7 @@ is_scalar, is_string_dtype, ) -from cudf.core._base_index import BaseIndex, _index_astype_docstring +from cudf.core._base_index import BaseIndex from cudf.core.column import ( CategoricalColumn, ColumnBase, @@ -56,7 +56,7 @@ from cudf.core.frame import Frame from cudf.core.mixins import BinaryOperand from cudf.core.single_column_frame import SingleColumnFrame -from cudf.utils.docutils import copy_docstring, doc_apply +from cudf.utils.docutils import copy_docstring from cudf.utils.dtypes import ( _maybe_convert_to_default_type, find_common_type, @@ -220,9 +220,6 @@ def searchsorted( @property # type: ignore @_cudf_nvtx_annotate def name(self): - """ - Returns the name of the Index. - """ return self._name @name.setter # type: ignore @@ -377,7 +374,6 @@ def copy(self, name=None, deep=False, dtype=None, names=None): ) @_cudf_nvtx_annotate - @doc_apply(_index_astype_docstring) def astype(self, dtype, copy: bool = True): if is_dtype_equal(dtype, self.dtype): return self @@ -492,9 +488,6 @@ def to_pandas(self, nullable=False): @property def is_unique(self): - """ - Return if the index has unique values. - """ return True @cached_property @@ -1074,17 +1067,13 @@ def _concat(cls, objs): def memory_usage(self, deep=False): return self._column.memory_usage + @cached_property # type: ignore @_cudf_nvtx_annotate - def equals(self, other, **kwargs): - """ - Determine if two Index objects contain the same elements. 
+ def is_unique(self): + return self._column.is_unique - Returns - ------- - out: bool - True if "other" is an Index and it has the same elements - as calling index; False otherwise. - """ + @_cudf_nvtx_annotate + def equals(self, other): if ( other is None or not isinstance(other, BaseIndex) @@ -1165,7 +1154,6 @@ def copy(self, name=None, deep=False, dtype=None, names=None): return _index_from_data({name: col.copy(True) if deep else col}) @_cudf_nvtx_annotate - @doc_apply(_index_astype_docstring) def astype(self, dtype, copy: bool = True): return _index_from_data(super().astype({self.name: dtype}, copy)) @@ -1437,7 +1425,7 @@ def argsort( ascending=True, na_position="last", ): - """Return the integer indices that would sort the Series values. + """Return the integer indices that would sort the index. Parameters ---------- @@ -3267,9 +3255,9 @@ class Index(BaseIndex, metaclass=IndexMeta): Warnings -------- This class should not be subclassed. It is designed as a factory for - different subclasses of :class:`BaseIndex` depending on the provided input. + different subclasses of `BaseIndex` depending on the provided input. If you absolutely must, and if you're intimately familiar with the - internals of cuDF, subclass :class:`BaseIndex` instead. + internals of cuDF, subclass `BaseIndex` instead. Examples -------- diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py index 5bd19b9f9c1..e6ac34f2290 100644 --- a/python/cudf/cudf/core/indexed_frame.py +++ b/python/cudf/cudf/core/indexed_frame.py @@ -448,6 +448,68 @@ def _check_data_index_length_match(self) -> None: f"match length of index ({len(self._index)})" ) + @property + @_cudf_nvtx_annotate + def empty(self): + """ + Indicator whether DataFrame or Series is empty. + + True if DataFrame/Series is entirely empty (no items), + meaning any of the axes are of length 0. + + Returns + ------- + out : bool + If DataFrame/Series is empty, return True, if not return False. + + Notes + ----- + If DataFrame/Series contains only `null` values, it is still not + considered empty. See the example below. + + Examples + -------- + >>> import cudf + >>> df = cudf.DataFrame({'A' : []}) + >>> df + Empty DataFrame + Columns: [A] + Index: [] + >>> df.empty + True + + If we only have `null` values in our DataFrame, it is + not considered empty! We will need to drop + the `null`'s to make the DataFrame empty: + + >>> df = cudf.DataFrame({'A' : [None, None]}) + >>> df + A + 0 + 1 + >>> df.empty + False + >>> df.dropna().empty + True + + Non-empty and empty Series example: + + >>> s = cudf.Series([1, 2, None]) + >>> s + 0 1 + 1 2 + 2 + dtype: int64 + >>> s.empty + False + >>> s = cudf.Series([]) + >>> s + Series([], dtype: float64) + >>> s.empty + True + """ + return self.size == 0 + def copy(self, deep: bool = True) -> Self: """Make a copy of this object's indices and data. @@ -1491,6 +1553,7 @@ def sort_index( Examples -------- **Series** + >>> import cudf >>> series = cudf.Series(['a', 'b', 'c', 'd'], index=[3, 2, 1, 4]) >>> series @@ -1516,6 +1579,7 @@ def sort_index( dtype: object **DataFrame** + >>> df = cudf.DataFrame( ... 
{"b":[3, 2, 1], "a":[2, 1, 3]}, index=[1, 3, 2]) >>> df.sort_index(axis=0) @@ -2242,6 +2306,7 @@ def add_suffix(self, suffix): Examples -------- **Series** + >>> s = cudf.Series([1, 2, 3, 4]) >>> s 0 1 @@ -2257,6 +2322,7 @@ def add_suffix(self, suffix): dtype: int64 **DataFrame** + >>> df = cudf.DataFrame({'A': [1, 2, 3, 4], 'B': [3, 4, 5, 6]}) >>> df A B @@ -2271,10 +2337,7 @@ def add_suffix(self, suffix): 2 3 5 3 4 6 """ - raise NotImplementedError( - "`IndexedFrame.add_suffix` not currently implemented. \ - Use `Series.add_suffix` or `DataFrame.add_suffix`" - ) + raise NotImplementedError @acquire_spill_lock() @_cudf_nvtx_annotate diff --git a/python/cudf/cudf/core/multiindex.py b/python/cudf/cudf/core/multiindex.py index c1685274198..3c0e29a96be 100644 --- a/python/cudf/cudf/core/multiindex.py +++ b/python/cudf/cudf/core/multiindex.py @@ -24,13 +24,7 @@ from cudf.core import column from cudf.core._compat import PANDAS_GE_150 from cudf.core.frame import Frame -from cudf.core.index import ( - BaseIndex, - _index_astype_docstring, - _lexsorted_equal_range, - as_index, -) -from cudf.utils.docutils import doc_apply +from cudf.core.index import BaseIndex, _lexsorted_equal_range, as_index from cudf.utils.utils import NotIterable, _cudf_nvtx_annotate @@ -199,7 +193,6 @@ def names(self, value): self._names = pd.core.indexes.frozen.FrozenList(value) @_cudf_nvtx_annotate - @doc_apply(_index_astype_docstring) def astype(self, dtype, copy: bool = True): if not is_object_dtype(dtype): raise TypeError( @@ -1530,6 +1523,10 @@ def from_pandas(cls, multiindex, nan_as_null=None): def is_unique(self): return len(self) == len(self.unique()) + @property + def dtype(self): + return np.dtype("O") + @cached_property # type: ignore @_cudf_nvtx_annotate def is_monotonic_increasing(self): diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index 02de3b8282a..6ff0584538e 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -658,6 +658,17 @@ def from_pandas(cls, s, nan_as_null=None): """ return cls(s, nan_as_null=nan_as_null) + @property # type: ignore + @_cudf_nvtx_annotate + def is_unique(self): + """Return boolean if values in the object are unique. + + Returns + ------- + bool + """ + return self._column.is_unique + @property # type: ignore @_cudf_nvtx_annotate def dt(self): @@ -820,6 +831,15 @@ def drop( labels, axis, index, columns, level, inplace, errors ) + def tolist(self): # noqa: D102 + raise TypeError( + "cuDF does not support conversion to host memory " + "via the `tolist()` method. Consider using " + "`.to_arrow().to_pylist()` to construct a Python list." + ) + + to_list = tolist + @_cudf_nvtx_annotate def to_dict(self, into: type[dict] = dict) -> dict: """ diff --git a/python/cudf/cudf/core/single_column_frame.py b/python/cudf/cudf/core/single_column_frame.py index 91a192e5942..0edad039444 100644 --- a/python/cudf/cudf/core/single_column_frame.py +++ b/python/cudf/cudf/core/single_column_frame.py @@ -7,7 +7,7 @@ from typing import Any, Dict, Optional, Tuple, Union import cupy -import numpy as np +import numpy import cudf from cudf._typing import Dtype, NotImplementedType, ScalarLike @@ -134,18 +134,9 @@ def to_numpy( dtype: Union[Dtype, None] = None, copy: bool = True, na_value=None, - ) -> np.ndarray: # noqa: D102 + ) -> numpy.ndarray: # noqa: D102 return super().to_numpy(dtype, copy, na_value).flatten() - def tolist(self): # noqa: D102 - raise TypeError( - "cuDF does not support conversion to host memory " - "via the `tolist()` method. 
Consider using " - "`.to_arrow().to_pylist()` to construct a Python list." - ) - - to_list = tolist - @classmethod @_cudf_nvtx_annotate def from_arrow(cls, array): @@ -209,17 +200,6 @@ def to_arrow(self): """ return self._column.to_arrow() - @property # type: ignore - @_cudf_nvtx_annotate - def is_unique(self): - """Return boolean if values in the object are unique. - - Returns - ------- - bool - """ - return self._column.is_unique - @property # type: ignore @_cudf_nvtx_annotate def is_monotonic(self): From 9e099cef25b11821c6307bb9c231656a2bae700f Mon Sep 17 00:00:00 2001 From: GALI PREM SAGAR Date: Mon, 7 Aug 2023 18:06:01 -0500 Subject: [PATCH 034/230] Raise error when trying to construct time-zone aware timestamps (#13830) Fixes: #13825 This PR raises an error when a time-zone-aware scalar is passed to binops or cudf scalar construction. Authors: - GALI PREM SAGAR (https://github.com/galipremsagar) Approvers: - Bradley Dice (https://github.com/bdice) URL: https://github.com/rapidsai/cudf/pull/13830 --- python/cudf/cudf/core/column/datetime.py | 18 +++++++++++++----- python/cudf/cudf/core/column/timedelta.py | 16 +++++++++++++--- python/cudf/cudf/tests/test_datetime.py | 12 ++++++++++++ python/cudf/cudf/tests/test_scalar.py | 12 +++++++++++- python/cudf/cudf/tests/test_timedelta.py | 11 +++++++++++ python/cudf/cudf/utils/dtypes.py | 17 ++++++++++++----- 6 files changed, 72 insertions(+), 14 deletions(-) diff --git a/python/cudf/cudf/core/column/datetime.py b/python/cudf/cudf/core/column/datetime.py index 936beb289ec..84d283d3f22 100644 --- a/python/cudf/cudf/core/column/datetime.py +++ b/python/cudf/cudf/core/column/datetime.py @@ -242,14 +242,22 @@ def normalize_binop_value(self, other: DatetimeLikeScalar) -> ScalarLike: if isinstance(other, (cudf.Scalar, ColumnBase, cudf.DateOffset)): return other - if isinstance(other, datetime.datetime): - other = np.datetime64(other) - elif isinstance(other, datetime.timedelta): - other = np.timedelta64(other) - elif isinstance(other, pd.Timestamp): + tz_error_msg = ( + "Cannot perform binary operation on timezone-naive columns" + " and timezone-aware timestamps." + ) + if isinstance(other, pd.Timestamp): + if other.tz is not None: + raise NotImplementedError(tz_error_msg) other = other.to_datetime64() elif isinstance(other, pd.Timedelta): other = other.to_timedelta64() + elif isinstance(other, datetime.datetime): + if other.tzinfo is not None: + raise NotImplementedError(tz_error_msg) + other = np.datetime64(other) + elif isinstance(other, datetime.timedelta): + other = np.timedelta64(other) if isinstance(other, np.datetime64): if np.isnat(other): diff --git a/python/cudf/cudf/core/column/timedelta.py b/python/cudf/cudf/core/column/timedelta.py index f2c4c0fe481..272f6e20985 100644 --- a/python/cudf/cudf/core/column/timedelta.py +++ b/python/cudf/cudf/core/column/timedelta.py @@ -213,12 +213,22 @@ def _binaryop(self, other: ColumnBinaryOperand, op: str) -> ColumnBase: def normalize_binop_value(self, other) -> ColumnBinaryOperand: if isinstance(other, (ColumnBase, cudf.Scalar)): return other - if isinstance(other, datetime.timedelta): - other = np.timedelta64(other) - elif isinstance(other, pd.Timestamp): + + tz_error_msg = ( + "Cannot perform binary operation on timezone-naive columns" + " and timezone-aware timestamps." 
+ ) + if isinstance(other, pd.Timestamp): + if other.tz is not None: + raise NotImplementedError(tz_error_msg) other = other.to_datetime64() elif isinstance(other, pd.Timedelta): other = other.to_timedelta64() + elif isinstance(other, datetime.timedelta): + other = np.timedelta64(other) + elif isinstance(other, datetime.datetime) and other.tzinfo is not None: + raise NotImplementedError(tz_error_msg) + if isinstance(other, np.timedelta64): other_time_unit = cudf.utils.dtypes.get_time_unit(other) if np.isnat(other): diff --git a/python/cudf/cudf/tests/test_datetime.py b/python/cudf/cudf/tests/test_datetime.py index b0ef79b44e9..417df53c9c9 100644 --- a/python/cudf/cudf/tests/test_datetime.py +++ b/python/cudf/cudf/tests/test_datetime.py @@ -2093,3 +2093,15 @@ def test_construction_from_tz_timestamps(data): _ = cudf.Series(data) with pytest.raises(NotImplementedError): _ = cudf.Index(data) + + +@pytest.mark.parametrize("op", _cmpops) +def test_datetime_binop_tz_timestamp(op): + s = cudf.Series([1, 2, 3], dtype="datetime64[ns]") + pd_tz_timestamp = pd.Timestamp("1970-01-01 00:00:00.000000001", tz="utc") + with pytest.raises(NotImplementedError): + op(s, pd_tz_timestamp) + + date_scalar = datetime.datetime.now(datetime.timezone.utc) + with pytest.raises(NotImplementedError): + op(s, date_scalar) diff --git a/python/cudf/cudf/tests/test_scalar.py b/python/cudf/cudf/tests/test_scalar.py index 5e1e58f9e68..c1aeb987eff 100644 --- a/python/cudf/cudf/tests/test_scalar.py +++ b/python/cudf/cudf/tests/test_scalar.py @@ -1,4 +1,4 @@ -# Copyright (c) 2021-2022, NVIDIA CORPORATION. +# Copyright (c) 2021-2023, NVIDIA CORPORATION. import datetime import re @@ -450,3 +450,13 @@ def test_scalar_numpy_casting(): s1 = cudf.Scalar(1, dtype=np.int32) s2 = np.int64(2) assert s1 < s2 + + +def test_construct_timezone_scalar_error(): + pd_scalar = pd.Timestamp("1970-01-01 00:00:00.000000001", tz="utc") + with pytest.raises(NotImplementedError): + cudf.utils.dtypes.to_cudf_compatible_scalar(pd_scalar) + + date_scalar = datetime.datetime.now(datetime.timezone.utc) + with pytest.raises(NotImplementedError): + cudf.utils.dtypes.to_cudf_compatible_scalar(date_scalar) diff --git a/python/cudf/cudf/tests/test_timedelta.py b/python/cudf/cudf/tests/test_timedelta.py index 4b1e8cf1027..ab45374c119 100644 --- a/python/cudf/cudf/tests/test_timedelta.py +++ b/python/cudf/cudf/tests/test_timedelta.py @@ -1426,3 +1426,14 @@ def test_timedelta_constructor(data, dtype): actual = cudf.TimedeltaIndex(data=cudf.Series(data), dtype=dtype) assert_eq(expected, actual) + + +@pytest.mark.parametrize("op", [operator.add, operator.sub]) +def test_timdelta_binop_tz_timestamp(op): + s = cudf.Series([1, 2, 3], dtype="timedelta64[ns]") + pd_tz_timestamp = pd.Timestamp("1970-01-01 00:00:00.000000001", tz="utc") + with pytest.raises(NotImplementedError): + op(s, pd_tz_timestamp) + date_tz_scalar = datetime.datetime.now(datetime.timezone.utc) + with pytest.raises(NotImplementedError): + op(s, date_tz_scalar) diff --git a/python/cudf/cudf/utils/dtypes.py b/python/cudf/cudf/utils/dtypes.py index b8dc33345b1..d5e9e5854df 100644 --- a/python/cudf/cudf/utils/dtypes.py +++ b/python/cudf/cudf/utils/dtypes.py @@ -270,14 +270,21 @@ def to_cudf_compatible_scalar(val, dtype=None): # the string value directly (cudf.DeviceScalar will DTRT) return val - if isinstance(val, datetime.datetime): - val = np.datetime64(val) - elif isinstance(val, datetime.timedelta): - val = np.timedelta64(val) - elif isinstance(val, pd.Timestamp): + tz_error_msg = ( + "Cannot covert a 
timezone-aware timestamp to timezone-naive scalar." + ) + if isinstance(val, pd.Timestamp): + if val.tz is not None: + raise NotImplementedError(tz_error_msg) val = val.to_datetime64() elif isinstance(val, pd.Timedelta): val = val.to_timedelta64() + elif isinstance(val, datetime.datetime): + if val.tzinfo is not None: + raise NotImplementedError(tz_error_msg) + val = np.datetime64(val) + elif isinstance(val, datetime.timedelta): + val = np.timedelta64(val) val = _maybe_convert_to_default_type( cudf.api.types.pandas_dtype(type(val)) From cd3ddca848f3e888df6a0e4e2b949de641e8dd9a Mon Sep 17 00:00:00 2001 From: Vukasin Milovanovic Date: Tue, 8 Aug 2023 10:23:12 -0700 Subject: [PATCH 035/230] Fix cuFile I/O factories (#13829) Factories that instantiate `cufile_input_impl`/`cufile_output_impl` objects did not return the created objects, and instead always returned `nullptr`. This PR fixes this bug, which made the "GDS" and "OFF" cuFile use policies effectively the same. The default policy value, "KVIKIO", should be unaffected. Authors: - Vukasin Milovanovic (https://github.com/vuule) Approvers: - Nghia Truong (https://github.com/ttnghia) - Karthikeyan (https://github.com/karthikeyann) URL: https://github.com/rapidsai/cudf/pull/13829 --- cpp/src/io/utilities/file_io_utilities.cpp | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/cpp/src/io/utilities/file_io_utilities.cpp b/cpp/src/io/utilities/file_io_utilities.cpp index 470bd04fcc3..28eae8b8e97 100644 --- a/cpp/src/io/utilities/file_io_utilities.cpp +++ b/cpp/src/io/utilities/file_io_utilities.cpp @@ -288,8 +288,9 @@ std::unique_ptr make_cufile_input(std::string const& filepath { if (cufile_integration::is_gds_enabled()) { try { - auto const cufile_in = std::make_unique(filepath); + auto cufile_in = std::make_unique(filepath); CUDF_LOG_INFO("File successfully opened for reading with GDS."); + return cufile_in; } catch (...) { if (cufile_integration::is_always_enabled()) { CUDF_LOG_ERROR( @@ -302,15 +303,16 @@ std::unique_ptr make_cufile_input(std::string const& filepath "buffer (possible performance impact)."); } } - return nullptr; + return {}; } std::unique_ptr make_cufile_output(std::string const& filepath) { if (cufile_integration::is_gds_enabled()) { try { - auto const cufile_out = std::make_unique(filepath); + auto cufile_out = std::make_unique(filepath); CUDF_LOG_INFO("File successfully opened for writing with GDS."); + return cufile_out; } catch (...) { if (cufile_integration::is_always_enabled()) { CUDF_LOG_ERROR( @@ -323,7 +325,7 @@ std::unique_ptr make_cufile_output(std::string const& filepa "buffer (possible performance impact)."); } } - return nullptr; + return {}; } std::vector make_file_io_slices(size_t size, size_t max_slice_size) From 9b80bfdc71d68bb27646124f674aa2d15585fe97 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Tue, 8 Aug 2023 10:52:41 -0700 Subject: [PATCH 036/230] Simplify Python doc configuration (#13826) This PR is a follow-up to #13789 that adds specified lists of methods/attributes to some classes; removes redundancy in the autosummary templates we are using and adds documentation explaining how they work; and removes various pieces of outdated code in our conf.py to make it easier to maintain going forward. 
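For illustration, the explicit listing this change adds to a class docstring looks like the following excerpt (drawn from the `CategoricalIndex` diff below; other Index subclasses follow the same pattern, and a section that should stay empty is written as "None"):

    Attributes
    ----------
    codes
    categories

    Methods
    -------
    equals

numpydoc leaves these hand-written sections alone and only auto-generates Methods/Attributes tables for classes (such as DataFrame and Series) whose docstrings do not define them.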
Authors: - Vyas Ramasubramani (https://github.com/vyasr) Approvers: - Bradley Dice (https://github.com/bdice) - Matthew Roeschke (https://github.com/mroeschke) URL: https://github.com/rapidsai/cudf/pull/13826 --- ...lass_without_autosummary.rst => class.rst} | 3 + .../autosummary/class_with_autosummary.rst | 33 ---- docs/cudf/source/api_docs/dataframe.rst | 1 - .../cudf/source/api_docs/extension_dtypes.rst | 6 - docs/cudf/source/api_docs/index_objects.rst | 14 +- docs/cudf/source/api_docs/io.rst | 2 - docs/cudf/source/api_docs/series.rst | 1 - .../cudf/source/api_docs/subword_tokenize.rst | 1 - docs/cudf/source/conf.py | 41 +---- .../source/developer_guide/documentation.md | 29 ++++ python/cudf/cudf/core/dtypes.py | 41 +++++ python/cudf/cudf/core/index.py | 148 +++++++++++++++++- python/cudf/cudf/core/multiindex.py | 27 ++++ 13 files changed, 259 insertions(+), 88 deletions(-) rename docs/cudf/source/_templates/autosummary/{class_without_autosummary.rst => class.rst} (50%) delete mode 100644 docs/cudf/source/_templates/autosummary/class_with_autosummary.rst diff --git a/docs/cudf/source/_templates/autosummary/class_without_autosummary.rst b/docs/cudf/source/_templates/autosummary/class.rst similarity index 50% rename from docs/cudf/source/_templates/autosummary/class_without_autosummary.rst rename to docs/cudf/source/_templates/autosummary/class.rst index 6676c672b20..a16cd0d7305 100644 --- a/docs/cudf/source/_templates/autosummary/class_without_autosummary.rst +++ b/docs/cudf/source/_templates/autosummary/class.rst @@ -4,3 +4,6 @@ .. currentmodule:: {{ module }} .. autoclass:: {{ objname }} + +.. + Don't include the methods or attributes sections, numpydoc adds them for us instead. diff --git a/docs/cudf/source/_templates/autosummary/class_with_autosummary.rst b/docs/cudf/source/_templates/autosummary/class_with_autosummary.rst deleted file mode 100644 index a9c9bd2b650..00000000000 --- a/docs/cudf/source/_templates/autosummary/class_with_autosummary.rst +++ /dev/null @@ -1,33 +0,0 @@ -{% extends "!autosummary/class.rst" %} - -{% block methods %} -{% if methods %} - -.. - HACK -- the point here is that we don't want this to appear in the output, but the autosummary should still generate the pages. - .. autosummary:: - :toctree: - {% for item in all_methods %} - {%- if not item.startswith('_') or item in ['__call__'] %} - {{ name }}.{{ item }} - {%- endif -%} - {%- endfor %} - -{% endif %} -{% endblock %} - -{% block attributes %} -{% if attributes %} - -.. - HACK -- the point here is that we don't want this to appear in the output, but the autosummary should still generate the pages. - .. autosummary:: - :toctree: - {% for item in all_attributes %} - {%- if not item.startswith('_') %} - {{ name }}.{{ item }} - {%- endif -%} - {%- endfor %} - -{% endif %} -{% endblock %} diff --git a/docs/cudf/source/api_docs/dataframe.rst b/docs/cudf/source/api_docs/dataframe.rst index 642996f9379..a8f3edf5a04 100644 --- a/docs/cudf/source/api_docs/dataframe.rst +++ b/docs/cudf/source/api_docs/dataframe.rst @@ -7,7 +7,6 @@ Constructor ~~~~~~~~~~~ .. autosummary:: :toctree: api/ - :template: autosummary/class_with_autosummary.rst DataFrame diff --git a/docs/cudf/source/api_docs/extension_dtypes.rst b/docs/cudf/source/api_docs/extension_dtypes.rst index b470df4aa00..daccb01b737 100644 --- a/docs/cudf/source/api_docs/extension_dtypes.rst +++ b/docs/cudf/source/api_docs/extension_dtypes.rst @@ -10,7 +10,6 @@ cudf.CategoricalDtype ===================== .. 
autosummary:: :toctree: api/ - :template: autosummary/class_without_autosummary.rst CategoricalDtype @@ -41,7 +40,6 @@ cudf.Decimal32Dtype =================== .. autosummary:: :toctree: api/ - :template: autosummary/class_without_autosummary.rst Decimal32Dtype @@ -70,7 +68,6 @@ cudf.Decimal64Dtype =================== .. autosummary:: :toctree: api/ - :template: autosummary/class_without_autosummary.rst Decimal64Dtype @@ -99,7 +96,6 @@ cudf.Decimal128Dtype ==================== .. autosummary:: :toctree: api/ - :template: autosummary/class_without_autosummary.rst Decimal128Dtype @@ -128,7 +124,6 @@ cudf.ListDtype ============== .. autosummary:: :toctree: api/ - :template: autosummary/class_without_autosummary.rst ListDtype @@ -154,7 +149,6 @@ cudf.StructDtype ================ .. autosummary:: :toctree: api/ - :template: autosummary/class_without_autosummary.rst StructDtype diff --git a/docs/cudf/source/api_docs/index_objects.rst b/docs/cudf/source/api_docs/index_objects.rst index a6a23d189e9..013eaf29a56 100644 --- a/docs/cudf/source/api_docs/index_objects.rst +++ b/docs/cudf/source/api_docs/index_objects.rst @@ -12,7 +12,6 @@ used before calling these methods directly.** .. autosummary:: :toctree: api/ - :template: autosummary/class_with_autosummary.rst Index @@ -162,9 +161,13 @@ Numeric Index ------------- .. autosummary:: :toctree: api/ - :template: autosummary/class_without_autosummary.rst RangeIndex + RangeIndex.start + RangeIndex.stop + RangeIndex.step + RangeIndex.to_numpy + RangeIndex.to_arrow Int64Index UInt64Index Float64Index @@ -175,7 +178,6 @@ CategoricalIndex ---------------- .. autosummary:: :toctree: api/ - :template: autosummary/class_without_autosummary.rst CategoricalIndex @@ -200,7 +202,6 @@ IntervalIndex ------------- .. autosummary:: :toctree: api/ - :template: autosummary/class_without_autosummary.rst IntervalIndex @@ -219,7 +220,6 @@ MultiIndex ---------- .. autosummary:: :toctree: api/ - :template: autosummary/class_without_autosummary.rst MultiIndex @@ -250,6 +250,7 @@ MultiIndex components MultiIndex.to_frame MultiIndex.droplevel + MultiIndex.swaplevel MultiIndex selecting ~~~~~~~~~~~~~~~~~~~~ @@ -265,7 +266,6 @@ DatetimeIndex ------------- .. autosummary:: :toctree: api/ - :template: autosummary/class_without_autosummary.rst DatetimeIndex @@ -299,6 +299,7 @@ Time-specific operations DatetimeIndex.round DatetimeIndex.ceil DatetimeIndex.floor + DatetimeIndex.tz_convert DatetimeIndex.tz_localize Conversion @@ -313,7 +314,6 @@ TimedeltaIndex -------------- .. autosummary:: :toctree: api/ - :template: autosummary/class_without_autosummary.rst TimedeltaIndex diff --git a/docs/cudf/source/api_docs/io.rst b/docs/cudf/source/api_docs/io.rst index 05c0cc82e62..417970715f8 100644 --- a/docs/cudf/source/api_docs/io.rst +++ b/docs/cudf/source/api_docs/io.rst @@ -36,8 +36,6 @@ Parquet read_parquet DataFrame.to_parquet cudf.io.parquet.read_parquet_metadata - :template: autosummary/class_with_autosummary.rst - cudf.io.parquet.ParquetDatasetWriter cudf.io.parquet.ParquetDatasetWriter.close cudf.io.parquet.ParquetDatasetWriter.write_table diff --git a/docs/cudf/source/api_docs/series.rst b/docs/cudf/source/api_docs/series.rst index 8bab649f079..ebfc1e3f5d1 100644 --- a/docs/cudf/source/api_docs/series.rst +++ b/docs/cudf/source/api_docs/series.rst @@ -7,7 +7,6 @@ Constructor ----------- .. 
autosummary:: :toctree: api/ - :template: autosummary/class_with_autosummary.rst Series diff --git a/docs/cudf/source/api_docs/subword_tokenize.rst b/docs/cudf/source/api_docs/subword_tokenize.rst index 80d77ebcde2..cd240fe4db4 100644 --- a/docs/cudf/source/api_docs/subword_tokenize.rst +++ b/docs/cudf/source/api_docs/subword_tokenize.rst @@ -7,7 +7,6 @@ Constructor ~~~~~~~~~~~ .. autosummary:: :toctree: api/ - :template: autosummary/class_with_autosummary.rst SubwordTokenizer SubwordTokenizer.__call__ diff --git a/docs/cudf/source/conf.py b/docs/cudf/source/conf.py index f9982c69e1b..03b1bb7039b 100644 --- a/docs/cudf/source/conf.py +++ b/docs/cudf/source/conf.py @@ -22,6 +22,7 @@ from docutils.nodes import Text from sphinx.addnodes import pending_xref +# -- Custom Extensions ---------------------------------------------------- sys.path.append(os.path.abspath("./_ext")) # -- General configuration ------------------------------------------------ @@ -52,9 +53,6 @@ copybutton_prompt_text = ">>> " autosummary_generate = True -ipython_mplbackend = "str" - -html_use_modindex = True # Enable automatic generation of systematic, namespaced labels for sections myst_heading_anchors = 2 @@ -100,9 +98,6 @@ # The name of the Pygments (syntax highlighting) style to use. pygments_style = "sphinx" -# If true, `todo` and `todoList` produce output, else they produce nothing. -todo_include_todos = False - html_theme_options = { "external_links": [], # https://github.com/pydata/pydata-sphinx-theme/issues/1220 @@ -209,14 +204,12 @@ # Config numpydoc numpydoc_show_inherited_class_members = { - "cudf.core.dtypes.CategoricalDtype": False, - "cudf.core.dtypes.Decimal32Dtype": False, - "cudf.core.dtypes.Decimal64Dtype": False, - "cudf.core.dtypes.Decimal128Dtype": False, - "cudf.core.dtypes.ListDtype": False, - "cudf.core.dtypes.StructDtype": False, + # option_context inherits undocumented members from the parent class "cudf.option_context": False, } + +# Rely on toctrees generated from autosummary on each of the pages we define +# rather than the autosummaries on the numpydoc auto-generated class pages. numpydoc_class_members_toctree = False numpydoc_attributes_as_param_list = False @@ -229,8 +222,6 @@ "cupy.core.core.ndarray": ("cupy.ndarray", "cupy.ndarray"), } -_internal_names_to_ignore = {"cudf.core.column.string.StringColumn"} - def resolve_aliases(app, doctree): pending_xrefs = doctree.traverse(condition=pending_xref) @@ -254,26 +245,7 @@ def ignore_internal_references(app, env, node, contnode): # use `cudf.Index` node["reftarget"] = "cudf.Index" return contnode - elif name is not None and name in _internal_names_to_ignore: - node["reftarget"] = "" - return contnode - - -def process_class_docstrings(app, what, name, obj, options, lines): - """ - For those classes for which we use :: - :template: autosummary/class_without_autosummary.rst - the documented attributes/methods have to be listed in the class - docstring. However, if one of those lists is empty, we use 'None', - which then generates warnings in sphinx / ugly html output. - This "autodoc-process-docstring" event connector removes that part - from the processed docstring. - """ - if what == "class": - if name in {"cudf.RangeIndex", "cudf.Int64Index", "cudf.UInt64Index", "cudf.Float64Index", "cudf.CategoricalIndex", "cudf.IntervalIndex", "cudf.MultiIndex", "cudf.DatetimeIndex", "cudf.TimedeltaIndex", "cudf.TimedeltaIndex"}: - - cut_index = lines.index('.. 
rubric:: Attributes') - lines[:] = lines[:cut_index] + return None nitpick_ignore = [ @@ -289,4 +261,3 @@ def setup(app): app.add_js_file("https://docs.rapids.ai/assets/js/custom.js", loading_method="defer") app.connect("doctree-read", resolve_aliases) app.connect("missing-reference", ignore_internal_references) - app.connect("autodoc-process-docstring", process_class_docstrings) diff --git a/docs/cudf/source/developer_guide/documentation.md b/docs/cudf/source/developer_guide/documentation.md index 187934cd274..26557de917a 100644 --- a/docs/cudf/source/developer_guide/documentation.md +++ b/docs/cudf/source/developer_guide/documentation.md @@ -121,6 +121,35 @@ while still matching the pandas layout as closely as possible. When adding a new API, developers simply have to add the API to the appropriate page. Adding the name of the function to the appropriate autosummary list is sufficient for it to be documented. +### Documenting classes + +Python classes and the Sphinx plugins used in RAPIDS interact in nontrivial ways. +`autosummary`'s default page generated for a class uses [`autodoc`](https://www.sphinx-doc.org/en/master/usage/extensions/autodoc.html) to automatically detect and document all methods of a class. +That means that in addition to the manually created `autosummary` pages where class methods are grouped into sections of related features, there is another page for each class where all the methods of that class are automatically summarized in a table for quick access. +However, we also use the [`numpydoc`](https://numpydoc.readthedocs.io/) extension, which offers the same feature. +We use both in order to match the contents and style of the pandas documentation as closely as possible. + +pandas is also particular about what information is included in a class's documentation. +While the documentation pages for the major user-facing classes like `DataFrame`, `Series`, and `Index` contain all APIs, less visible classes or subclasses (such as subclasses of `Index`) only include the methods that are specific to those subclasses. +For example, {py:class}`cudf.CategoricalIndex` only includes `codes` and `categories` on its page, not the entire set of `Index` functionality. + +To accommodate these requirements, we take the following approach: +1. The default `autosummary` template for classes is overridden with a [simpler template that does not generate method or attribute documentation](https://github.com/rapidsai/cudf/blob/main/docs/cudf/source/_templates/autosummary/class.rst). In other words, we disable `autosummary`'s generation of Methods and Attributes lists. +2. We rely on `numpydoc` entirely for the classes that need their entire APIs listed (`DataFrame`/`Series`/etc). `numpydoc` will automatically populate Methods and Attributes section if (and only if) they are not already defined in the class's docstring. +3. For classes that should only include a subset of APIs, we include those explicitly in the class's documentation. When those lists exist, `numpydoc` will not override them. If either the Methods or Attributes section should be empty, that section must still be included but should simply contain "None". For example, the class documentation for `CategoricalIndex` could include something like the following: + +``` + Attributes + ---------- + codes + categories + + Methods + ------- + None + +``` + ## Comparing to pandas cuDF aims to provide a pandas-like experience. 
diff --git a/python/cudf/cudf/core/dtypes.py b/python/cudf/cudf/core/dtypes.py index 4acdc2431f8..a83c1f7b3c9 100644 --- a/python/cudf/cudf/core/dtypes.py +++ b/python/cudf/cudf/core/dtypes.py @@ -144,6 +144,16 @@ class CategoricalDtype(_BaseDtype): when used in operations that combine categoricals, e.g. astype, and will resolve to False if there is no existing ordered to maintain. + Attributes + ---------- + categories + ordered + + Methods + ------- + from_pandas + to_pandas + Examples -------- >>> import cudf @@ -320,6 +330,16 @@ class ListDtype(_BaseDtype): element_type : object A dtype with which represents the element types in the list. + Attributes + ---------- + element_type + leaf_type + + Methods + ------- + from_arrow + to_arrow + Examples -------- >>> import cudf @@ -496,6 +516,16 @@ class StructDtype(_BaseDtype): A mapping of field names to dtypes, the dtypes can themselves be of ``StructDtype`` too. + Attributes + ---------- + fields + itemsize + + Methods + ------- + from_arrow + to_arrow + Examples -------- >>> import cudf @@ -649,6 +679,17 @@ def itemsize(self): scale : int, optional The scale of the dtype. See Notes below. + Attributes + ---------- + precision + scale + itemsize + + Methods + ------- + to_arrow + from_arrow + Notes ----- When the scale is positive: diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py index 618d2eb4553..44a1620da8a 100644 --- a/python/cudf/cudf/core/index.py +++ b/python/cudf/cudf/core/index.py @@ -156,9 +156,16 @@ class RangeIndex(BaseIndex, BinaryOperand): copy : bool, default False Unused, accepted for homogeneity with other index types. - Returns + Attributes + ---------- + start + stop + step + + Methods ------- - RangeIndex + to_numpy + to_arrow Examples -------- @@ -1637,6 +1644,14 @@ class Int8Index(NumericIndex): name : object Name to be stored in the index. + Attributes + ---------- + None + + Methods + ------- + None + Returns ------- Int8Index @@ -1662,6 +1677,14 @@ class Int16Index(NumericIndex): name : object Name to be stored in the index. + Attributes + ---------- + None + + Methods + ------- + None + Returns ------- Int16Index @@ -1687,6 +1710,14 @@ class Int32Index(NumericIndex): name : object Name to be stored in the index. + Attributes + ---------- + None + + Methods + ------- + None + Returns ------- Int32Index @@ -1712,6 +1743,14 @@ class Int64Index(NumericIndex): name : object Name to be stored in the index. + Attributes + ---------- + None + + Methods + ------- + None + Returns ------- Int64Index @@ -1737,6 +1776,14 @@ class UInt8Index(NumericIndex): name : object Name to be stored in the index. + Attributes + ---------- + None + + Methods + ------- + None + Returns ------- UInt8Index @@ -1762,6 +1809,14 @@ class UInt16Index(NumericIndex): name : object Name to be stored in the index. + Attributes + ---------- + None + + Methods + ------- + None + Returns ------- UInt16Index @@ -1787,6 +1842,14 @@ class UInt32Index(NumericIndex): name : object Name to be stored in the index. + Attributes + ---------- + None + + Methods + ------- + None + Returns ------- UInt32Index @@ -1812,6 +1875,14 @@ class UInt64Index(NumericIndex): name : object Name to be stored in the index. + Attributes + ---------- + None + + Methods + ------- + None + Returns ------- UInt64Index @@ -1837,6 +1908,14 @@ class Float32Index(NumericIndex): name : object Name to be stored in the index. 
+ Attributes + ---------- + None + + Methods + ------- + None + Returns ------- Float32Index @@ -1868,6 +1947,14 @@ class Float64Index(NumericIndex): name : object Name to be stored in the index. + Attributes + ---------- + None + + Methods + ------- + None + Returns ------- Float64Index @@ -1908,6 +1995,32 @@ class DatetimeIndex(GenericIndex): If True parse dates in data with the year first order. This is not yet supported + Attributes + ---------- + year + month + day + hour + minute + second + microsecond + nanosecond + date + time + dayofyear + day_of_year + weekday + quarter + freq + + Methods + ------- + ceil + floor + round + tz_convert + tz_localize + Returns ------- DatetimeIndex @@ -2567,6 +2680,19 @@ class TimedeltaIndex(GenericIndex): name : object Name to be stored in the index. + Attributes + ---------- + days + seconds + microseconds + nanoseconds + components + inferred_freq + + Methods + ------- + None + Returns ------- TimedeltaIndex @@ -2711,6 +2837,15 @@ class CategoricalIndex(GenericIndex): name : object, optional Name to be stored in the index. + Attributes + ---------- + codes + categories + + Methods + ------- + equals + Returns ------- CategoricalIndex @@ -2969,6 +3104,15 @@ class IntervalIndex(GenericIndex): name : object, optional Name to be stored in the index. + Attributes + ---------- + values + + Methods + ------- + from_breaks + get_loc + Returns ------- IntervalIndex diff --git a/python/cudf/cudf/core/multiindex.py b/python/cudf/cudf/core/multiindex.py index 3c0e29a96be..9285a21f696 100644 --- a/python/cudf/cudf/core/multiindex.py +++ b/python/cudf/cudf/core/multiindex.py @@ -68,6 +68,33 @@ class MultiIndex(Frame, BaseIndex, NotIterable): Check that the levels/codes are consistent and valid. Not yet supported + Attributes + ---------- + names + nlevels + dtypes + levels + codes + + Methods + ------- + from_arrays + from_tuples + from_product + from_frame + set_levels + set_codes + to_frame + to_flat_index + sortlevel + droplevel + swaplevel + reorder_levels + remove_unused_levels + get_level_values + get_loc + drop + Returns ------- MultiIndex From 31909386983648ee1e6a97cfa64492e305341784 Mon Sep 17 00:00:00 2001 From: GALI PREM SAGAR Date: Tue, 8 Aug 2023 18:53:41 -0500 Subject: [PATCH 037/230] Raise error for more cases when `timezone-aware` data is passed to `as_column` (#13835) Fixes: #13834 This PR raises `NotImplementedError` for more types of data that are timezone-aware passed to `as_column`. Authors: - GALI PREM SAGAR (https://github.com/galipremsagar) Approvers: - Matthew Roeschke (https://github.com/mroeschke) - Bradley Dice (https://github.com/bdice) URL: https://github.com/rapidsai/cudf/pull/13835 --- python/cudf/cudf/api/types.py | 16 +++++++++++----- python/cudf/cudf/core/column/column.py | 19 +++++++++++++++++++ python/cudf/cudf/tests/test_datetime.py | 2 ++ 3 files changed, 32 insertions(+), 5 deletions(-) diff --git a/python/cudf/cudf/api/types.py b/python/cudf/cudf/api/types.py index e94d3b504e8..a2afbde83eb 100644 --- a/python/cudf/cudf/api/types.py +++ b/python/cudf/cudf/api/types.py @@ -454,17 +454,23 @@ def is_any_real_numeric_dtype(arr_or_dtype) -> bool: # TODO: Evaluate which of the datetime types need special handling for cudf. 
is_datetime_dtype = _wrap_pandas_is_dtype_api(pd_types.is_datetime64_dtype) is_datetime64_any_dtype = pd_types.is_datetime64_any_dtype -is_datetime64_dtype = pd_types.is_datetime64_dtype -is_datetime64_ns_dtype = pd_types.is_datetime64_ns_dtype -is_datetime64tz_dtype = pd_types.is_datetime64tz_dtype +is_datetime64_dtype = _wrap_pandas_is_dtype_api(pd_types.is_datetime64_dtype) +is_datetime64_ns_dtype = _wrap_pandas_is_dtype_api( + pd_types.is_datetime64_ns_dtype +) +is_datetime64tz_dtype = _wrap_pandas_is_dtype_api( + pd_types.is_datetime64tz_dtype +) is_extension_type = pd_types.is_extension_type is_extension_array_dtype = pd_types.is_extension_array_dtype is_int64_dtype = pd_types.is_int64_dtype is_period_dtype = pd_types.is_period_dtype is_signed_integer_dtype = pd_types.is_signed_integer_dtype is_timedelta_dtype = _wrap_pandas_is_dtype_api(pd_types.is_timedelta64_dtype) -is_timedelta64_dtype = pd_types.is_timedelta64_dtype -is_timedelta64_ns_dtype = pd_types.is_timedelta64_ns_dtype +is_timedelta64_dtype = _wrap_pandas_is_dtype_api(pd_types.is_timedelta64_dtype) +is_timedelta64_ns_dtype = _wrap_pandas_is_dtype_api( + pd_types.is_timedelta64_ns_dtype +) is_unsigned_integer_dtype = pd_types.is_unsigned_integer_dtype is_sparse = pd_types.is_sparse # is_list_like = pd_types.is_list_like diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index 57f6c80fb05..53dbb9c50cc 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -53,6 +53,7 @@ infer_dtype, is_bool_dtype, is_categorical_dtype, + is_datetime64_dtype, is_datetime64tz_dtype, is_decimal32_dtype, is_decimal64_dtype, @@ -2230,6 +2231,12 @@ def as_column( data = ColumnBase.from_scalar(arbitrary, length if length else 1) elif isinstance(arbitrary, pd.core.arrays.masked.BaseMaskedArray): data = as_column(pa.Array.from_pandas(arbitrary), dtype=dtype) + elif isinstance(arbitrary, pd.DatetimeIndex) and isinstance( + arbitrary.dtype, pd.DatetimeTZDtype + ): + raise NotImplementedError( + "cuDF does not yet support timezone-aware datetimes" + ) else: try: data = as_column( @@ -2279,6 +2286,18 @@ def as_column( "Use `tz_localize()` to construct " "timezone aware data." ) + elif is_datetime64_dtype(dtype): + # Error checking only, actual construction happens + # below. 
+ pa_array = pa.array(arbitrary) + if ( + isinstance(pa_array.type, pa.TimestampType) + and pa_array.type.tz is not None + ): + raise NotImplementedError( + "cuDF does not yet support timezone-aware " + "datetimes" + ) if is_list_dtype(dtype): data = pa.array(arbitrary) if type(data) not in (pa.ListArray, pa.NullArray): diff --git a/python/cudf/cudf/tests/test_datetime.py b/python/cudf/cudf/tests/test_datetime.py index 417df53c9c9..dcb8781e712 100644 --- a/python/cudf/cudf/tests/test_datetime.py +++ b/python/cudf/cudf/tests/test_datetime.py @@ -2093,6 +2093,8 @@ def test_construction_from_tz_timestamps(data): _ = cudf.Series(data) with pytest.raises(NotImplementedError): _ = cudf.Index(data) + with pytest.raises(NotImplementedError): + _ = cudf.DatetimeIndex(data) @pytest.mark.parametrize("op", _cmpops) From ba6ff60aeaed59828770bd36a7026ad79776f30e Mon Sep 17 00:00:00 2001 From: brandon-b-miller <53796099+brandon-b-miller@users.noreply.github.com> Date: Tue, 8 Aug 2023 19:55:44 -0500 Subject: [PATCH 038/230] Return a Series from JIT GroupBy apply, rather than a DataFrame (#13820) Closes https://github.com/rapidsai/cudf/issues/13809 Authors: - https://github.com/brandon-b-miller Approvers: - GALI PREM SAGAR (https://github.com/galipremsagar) - Bradley Dice (https://github.com/bdice) URL: https://github.com/rapidsai/cudf/pull/13820 --- python/cudf/cudf/core/groupby/groupby.py | 13 +++++----- python/cudf/cudf/tests/test_groupby.py | 31 +++++++++++++----------- 2 files changed, 24 insertions(+), 20 deletions(-) diff --git a/python/cudf/cudf/core/groupby/groupby.py b/python/cudf/cudf/core/groupby/groupby.py index 2519fda326a..2ed9bed5b49 100644 --- a/python/cudf/cudf/core/groupby/groupby.py +++ b/python/cudf/cudf/core/groupby/groupby.py @@ -1204,8 +1204,7 @@ def _jit_groupby_apply( {None: chunk_results}, index=group_names ) result.index.names = self.grouping.names - result = result.reset_index() - result[None] = result.pop(0) + return result @_cudf_nvtx_annotate @@ -1364,10 +1363,12 @@ def mult(df): ... lambda group: group['b'].max() - group['b'].min(), ... engine='jit' ... 
) - a None - 0 1 1 - 1 2 1 - 2 3 1 + a + 1 1 + 2 1 + 3 1 + dtype: int64 + """ if self.obj.empty: diff --git a/python/cudf/cudf/tests/test_groupby.py b/python/cudf/cudf/tests/test_groupby.py index 7d22cb70803..b01b44da201 100644 --- a/python/cudf/cudf/tests/test_groupby.py +++ b/python/cudf/cudf/tests/test_groupby.py @@ -395,7 +395,7 @@ def groupby_jit_data(): def run_groupby_apply_jit_test(data, func, keys, *args): - expect_groupby_obj = data.to_pandas().groupby(keys, as_index=False) + expect_groupby_obj = data.to_pandas().groupby(keys) got_groupby_obj = data.groupby(keys) # compare cuDF jit to pandas @@ -475,7 +475,19 @@ def func(df): @pytest.mark.parametrize("dtype", ["float64"]) @pytest.mark.parametrize("func", ["idxmax", "idxmin"]) -@pytest.mark.parametrize("special_val", [np.nan, np.inf, -np.inf]) +@pytest.mark.parametrize( + "special_val", + [ + pytest.param( + np.nan, + marks=pytest.mark.xfail( + reason="https://github.com/rapidsai/cudf/issues/13832" + ), + ), + np.inf, + -np.inf, + ], +) def test_groupby_apply_jit_idx_reductions_special_vals( func, groupby_jit_data, dtype, special_val ): @@ -494,19 +506,10 @@ def func(df): groupby_jit_data["val1"] = special_val groupby_jit_data["val1"] = groupby_jit_data["val1"].astype(dtype) - expect = ( - groupby_jit_data.to_pandas() - .groupby("key1", as_index=False) - .apply(func) - ) - - grouped = groupby_jit_data.groupby("key1") - sorted = grouped._grouped()[3].to_pandas() - expect_vals = sorted["key1"].drop_duplicates().index - expect[None] = expect_vals + expect = groupby_jit_data.to_pandas().groupby("key1").apply(func) + got = groupby_jit_data.groupby("key1").apply(func, engine="jit") - got = grouped.apply(func, engine="jit") - assert_eq(expect, got) + assert_eq(expect, got, check_dtype=False) @pytest.mark.parametrize( From edb25a84aaafa0d65d36ebb94113393d4b6474fb Mon Sep 17 00:00:00 2001 From: GALI PREM SAGAR Date: Tue, 8 Aug 2023 20:52:11 -0500 Subject: [PATCH 039/230] Fix `any`, `all` reduction behavior for `axis=None` and warn for other reductions (#13831) Fixes: #13827 This PR: - [x] Fixes `axis=None` behavior for `any` & `all` reductions. - [x] Introduces `FutureWarning` for upcoming change in behavior for the rest of the reductions since some of the reductions are only updated in pandas-2.0 and the rest would be updated in pandas-3.0. - [x] Fixed numpy array function inconsistency because of mismatching default for `axis` parameter. Authors: - GALI PREM SAGAR (https://github.com/galipremsagar) Approvers: - Bradley Dice (https://github.com/bdice) URL: https://github.com/rapidsai/cudf/pull/13831 --- python/cudf/cudf/core/dataframe.py | 48 +++++++++++--- python/cudf/cudf/core/frame.py | 64 ++++++++++++++----- python/cudf/cudf/core/indexed_frame.py | 2 +- python/cudf/cudf/core/single_column_frame.py | 6 +- python/cudf/cudf/tests/test_array_function.py | 5 ++ python/cudf/cudf/tests/test_reductions.py | 60 ++++++++++++++++- 6 files changed, 156 insertions(+), 29 deletions(-) diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index 0298dd103f5..dabef4adde0 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -1333,6 +1333,11 @@ def __array_function__(self, func, types, args, kwargs): return NotImplemented try: + if func.__name__ in {"any", "all"}: + # NumPy default for `axis` is + # different from `cudf`/`pandas` + # hence need this special handling. 
+ kwargs.setdefault("axis", None) if cudf_func := getattr(self.__class__, func.__name__, None): out = cudf_func(*args, **kwargs) # The dot product of two DataFrames returns an array in pandas. @@ -2557,7 +2562,7 @@ def reindex( "Cannot specify both 'axis' and any of 'index' or 'columns'." ) - axis = self._get_axis_from_axis_arg(axis) + axis = 0 if axis is None else self._get_axis_from_axis_arg(axis) if axis == 0: if index is None: index = labels @@ -5798,7 +5803,6 @@ def count(self, axis=0, level=None, numeric_only=False, **kwargs): _SUPPORT_AXIS_LOOKUP = { 0: 0, 1: 1, - None: 0, "index": 0, "columns": 1, } @@ -5826,9 +5830,26 @@ def _reduce( if source.empty: return Series(index=cudf.Index([], dtype="str")) - axis = source._get_axis_from_axis_arg(axis) + if axis is None: + if op in {"any", "all"}: + axis = 2 + else: + # Do not remove until pandas 2.0 support is added. + warnings.warn( + f"In a future version, {type(self).__name__}" + f".{op}(axis=None) will return a scalar {op} over " + "the entire DataFrame. To retain the old behavior, " + f"use '{type(self).__name__}.{op}(axis=0)' or " + f"just '{type(self)}.{op}()'", + FutureWarning, + ) + axis = 0 + elif axis is no_default: + axis = 0 + else: + axis = source._get_axis_from_axis_arg(axis) - if axis == 0: + if axis in {0, 2}: try: result = [ getattr(source._data[col], op)(**kwargs) @@ -5867,7 +5888,10 @@ def _reduce( ) source = self._get_columns_by_label(numeric_cols) if source.empty: - return Series(index=cudf.Index([], dtype="str")) + if axis == 2: + return getattr(as_column([]), op)(**kwargs) + else: + return Series(index=cudf.Index([], dtype="str")) try: result = [ getattr(source._data[col], op)(**kwargs) @@ -5879,12 +5903,16 @@ def _reduce( ) else: raise - - return Series._from_data( - {None: result}, as_index(source._data.names) - ) + if axis == 2: + return getattr(as_column(result), op)(**kwargs) + else: + return Series._from_data( + {None: result}, as_index(source._data.names) + ) elif axis == 1: return source._apply_cupy_method_axis_1(op, **kwargs) + else: + raise ValueError(f"Invalid value of {axis=} received for {op}") @_cudf_nvtx_annotate def _scan( @@ -5894,6 +5922,8 @@ def _scan( *args, **kwargs, ): + if axis is None: + axis = 0 axis = self._get_axis_from_axis_arg(axis) if axis == 0: diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index 466a704c56e..69757fe900d 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -31,6 +31,7 @@ import cudf from cudf import _lib as libcudf from cudf._typing import Dtype +from cudf.api.extensions import no_default from cudf.api.types import is_bool_dtype, is_dtype_equal, is_scalar from cudf.core.buffer import acquire_spill_lock from cudf.core.column import ( @@ -1885,7 +1886,7 @@ def _reduce(self, *args, **kwargs): @_cudf_nvtx_annotate def min( self, - axis=None, + axis=no_default, skipna=True, level=None, numeric_only=None, @@ -1936,7 +1937,7 @@ def min( @_cudf_nvtx_annotate def max( self, - axis=None, + axis=no_default, skipna=True, level=None, numeric_only=None, @@ -1987,7 +1988,7 @@ def max( @_cudf_nvtx_annotate def sum( self, - axis=None, + axis=no_default, skipna=True, dtype=None, level=None, @@ -2045,7 +2046,7 @@ def sum( @_cudf_nvtx_annotate def product( self, - axis=None, + axis=no_default, skipna=True, dtype=None, level=None, @@ -2089,11 +2090,11 @@ def product( b 5040 dtype: int64 """ - axis = self._get_axis_from_axis_arg(axis) + return self._reduce( # cuDF columns use "product" as the op name, but cupy uses "prod" # and 
we need cupy if axis == 1. - "product" if axis == 0 else "prod", + "prod" if axis in {1, "columns"} else "product", axis=axis, skipna=skipna, dtype=dtype, @@ -2108,7 +2109,12 @@ def product( @_cudf_nvtx_annotate def mean( - self, axis=None, skipna=True, level=None, numeric_only=None, **kwargs + self, + axis=no_default, + skipna=True, + level=None, + numeric_only=None, + **kwargs, ): """ Return the mean of the values for the requested axis. @@ -2154,7 +2160,7 @@ def mean( @_cudf_nvtx_annotate def std( self, - axis=None, + axis=no_default, skipna=True, level=None, ddof=1, @@ -2210,7 +2216,7 @@ def std( @_cudf_nvtx_annotate def var( self, - axis=None, + axis=no_default, skipna=True, level=None, ddof=1, @@ -2264,7 +2270,12 @@ def var( @_cudf_nvtx_annotate def kurtosis( - self, axis=None, skipna=True, level=None, numeric_only=None, **kwargs + self, + axis=no_default, + skipna=True, + level=None, + numeric_only=None, + **kwargs, ): """ Return Fisher's unbiased kurtosis of a sample. @@ -2305,7 +2316,7 @@ def kurtosis( b -1.2 dtype: float64 """ - if axis not in (0, "index", None): + if axis not in (0, "index", None, no_default): raise NotImplementedError("Only axis=0 is currently supported.") return self._reduce( @@ -2322,7 +2333,12 @@ def kurtosis( @_cudf_nvtx_annotate def skew( - self, axis=None, skipna=True, level=None, numeric_only=None, **kwargs + self, + axis=no_default, + skipna=True, + level=None, + numeric_only=None, + **kwargs, ): """ Return unbiased Fisher-Pearson skew of a sample. @@ -2366,7 +2382,7 @@ def skew( b -0.37037 dtype: float64 """ - if axis not in (0, "index", None): + if axis not in (0, "index", None, no_default): raise NotImplementedError("Only axis=0 is currently supported.") return self._reduce( @@ -2385,6 +2401,15 @@ def all(self, axis=0, skipna=True, level=None, **kwargs): Parameters ---------- + axis : {0 or 'index', 1 or 'columns', None}, default 0 + Indicate which axis or axes should be reduced. For `Series` + this parameter is unused and defaults to `0`. + + - 0 or 'index' : reduce the index, return a Series + whose index is the original column labels. + - 1 or 'columns' : reduce the columns, return a Series + whose index is the original index. + - None : reduce all axes, return a scalar. skipna: bool, default True Exclude NA/null values. If the entire row/column is NA and skipna is True, then the result will be True, as for an @@ -2398,7 +2423,7 @@ def all(self, axis=0, skipna=True, level=None, **kwargs): Notes ----- - Parameters currently not supported are `axis`, `bool_only`, `level`. + Parameters currently not supported are `bool_only`, `level`. Examples -------- @@ -2424,6 +2449,15 @@ def any(self, axis=0, skipna=True, level=None, **kwargs): Parameters ---------- + axis : {0 or 'index', 1 or 'columns', None}, default 0 + Indicate which axis or axes should be reduced. For `Series` + this parameter is unused and defaults to `0`. + + - 0 or 'index' : reduce the index, return a Series + whose index is the original column labels. + - 1 or 'columns' : reduce the columns, return a Series + whose index is the original index. + - None : reduce all axes, return a scalar. skipna: bool, default True Exclude NA/null values. If the entire row/column is NA and skipna is True, then the result will be False, as for an @@ -2437,7 +2471,7 @@ def any(self, axis=0, skipna=True, level=None, **kwargs): Notes ----- - Parameters currently not supported are `axis`, `bool_only`, `level`. + Parameters currently not supported are `bool_only`, `level`. 
Examples -------- diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py index e6ac34f2290..51a2d085d00 100644 --- a/python/cudf/cudf/core/indexed_frame.py +++ b/python/cudf/cudf/core/indexed_frame.py @@ -3421,7 +3421,7 @@ def sample( 0 1 3 1 2 4 """ - axis = self._get_axis_from_axis_arg(axis) + axis = 0 if axis is None else self._get_axis_from_axis_arg(axis) size = self.shape[axis] # Compute `n` from parameter `frac`. diff --git a/python/cudf/cudf/core/single_column_frame.py b/python/cudf/cudf/core/single_column_frame.py index 0edad039444..ffb432ed14a 100644 --- a/python/cudf/cudf/core/single_column_frame.py +++ b/python/cudf/cudf/core/single_column_frame.py @@ -11,6 +11,7 @@ import cudf from cudf._typing import Dtype, NotImplementedType, ScalarLike +from cudf.api.extensions import no_default from cudf.api.types import ( _is_scalar_or_zero_d_array, is_bool_dtype, @@ -30,7 +31,6 @@ class SingleColumnFrame(Frame, NotIterable): _SUPPORT_AXIS_LOOKUP = { 0: 0, - None: 0, "index": 0, } @@ -38,12 +38,12 @@ class SingleColumnFrame(Frame, NotIterable): def _reduce( self, op, - axis=None, + axis=no_default, level=None, numeric_only=None, **kwargs, ): - if axis not in (None, 0): + if axis not in (None, 0, no_default): raise NotImplementedError("axis parameter is not implemented yet") if level is not None: diff --git a/python/cudf/cudf/tests/test_array_function.py b/python/cudf/cudf/tests/test_array_function.py index a355ebb40b2..758a8cbb535 100644 --- a/python/cudf/cudf/tests/test_array_function.py +++ b/python/cudf/cudf/tests/test_array_function.py @@ -67,6 +67,11 @@ def test_array_func_cudf_series(np_ar, func): lambda x: np.sum(x, axis=0), lambda x: np.var(x, ddof=1), lambda x: np.dot(x, x.transpose()), + lambda x: np.all(x), + lambda x: np.any(x), + lambda x: np.product(x), + lambda x: np.product(x, axis=0), + lambda x: np.product(x, axis=1), ], ) def test_array_func_cudf_dataframe(pd_df, func): diff --git a/python/cudf/cudf/tests/test_reductions.py b/python/cudf/cudf/tests/test_reductions.py index c549ac20f59..47968ec1d97 100644 --- a/python/cudf/cudf/tests/test_reductions.py +++ b/python/cudf/cudf/tests/test_reductions.py @@ -12,7 +12,12 @@ from cudf import Series from cudf.core.dtypes import Decimal32Dtype, Decimal64Dtype, Decimal128Dtype from cudf.testing import _utils as utils -from cudf.testing._utils import NUMERIC_TYPES, assert_eq, gen_rand +from cudf.testing._utils import ( + NUMERIC_TYPES, + assert_eq, + expect_warning_if, + gen_rand, +) params_dtype = NUMERIC_TYPES @@ -306,3 +311,56 @@ def test_categorical_reductions(op): psr = gsr.to_pandas() utils.assert_exceptions_equal(getattr(psr, op), getattr(gsr, op)) + + +@pytest.mark.parametrize( + "data", + [ + {"a": [1, 2, 3], "b": [10, 11, 12]}, + {"a": [1, 0, 3], "b": [10, 11, 12]}, + {"a": [1, 2, 3], "b": [10, 11, None]}, + { + "a": [], + }, + {}, + ], +) +@pytest.mark.parametrize("op", ["all", "any"]) +def test_any_all_axis_none(data, op): + gdf = cudf.DataFrame(data) + pdf = gdf.to_pandas() + + expected = getattr(pdf, op)(axis=None) + actual = getattr(gdf, op)(axis=None) + + assert expected == actual + + +@pytest.mark.parametrize( + "op", + [ + "sum", + "product", + "std", + "var", + "kurt", + "kurtosis", + "skew", + "min", + "max", + "mean", + "median", + ], +) +def test_reductions_axis_none_warning(op): + df = cudf.DataFrame({"a": [1, 2, 3], "b": [10, 2, 3]}) + pdf = df.to_pandas() + with pytest.warns(FutureWarning): + actual = getattr(df, op)(axis=None) + with expect_warning_if( + op in {"kurt", 
"kurtosis", "skew", "min", "max", "mean", "median"}, + FutureWarning, + ): + expected = getattr(pdf, op)(axis=None) + + assert_eq(expected, actual, check_dtype=False) From da6ac73e8849a6d5d7471f8aad60a8fd1141fe22 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Tue, 8 Aug 2023 22:55:25 -1000 Subject: [PATCH 040/230] DataFrame with namedtuples uses ._field as column names (#13824) Allow namedtuple's `_field` attribute to be mapped to DataFrame column labels like pandas closes #13823 Authors: - Matthew Roeschke (https://github.com/mroeschke) Approvers: - GALI PREM SAGAR (https://github.com/galipremsagar) - Lawrence Mitchell (https://github.com/wence-) URL: https://github.com/rapidsai/cudf/pull/13824 --- python/cudf/cudf/core/dataframe.py | 9 +++++++++ python/cudf/cudf/tests/test_dataframe.py | 20 +++++++++++++++++++- 2 files changed, 28 insertions(+), 1 deletion(-) diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index dabef4adde0..e4b944a88af 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -848,6 +848,15 @@ def _init_from_list_like(self, data, index=None, columns=None): for col in data ): raise TypeError("Inputs should be an iterable or sequence.") + if ( + len(data) > 0 + and columns is None + and isinstance(data[0], tuple) + and hasattr(data[0], "_fields") + ): + # pandas behavior is to use the fields from the first + # namedtuple as the column names + columns = data[0]._fields data = list(itertools.zip_longest(*data)) diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py index 0898cb2ef3d..97e399a9cd5 100644 --- a/python/cudf/cudf/tests/test_dataframe.py +++ b/python/cudf/cudf/tests/test_dataframe.py @@ -10,7 +10,7 @@ import string import textwrap import warnings -from collections import OrderedDict, defaultdict +from collections import OrderedDict, defaultdict, namedtuple from copy import copy import cupy @@ -10261,3 +10261,21 @@ def __getitem__(self, key): with pytest.raises(TypeError): cudf.DataFrame({"a": A()}) + + +def test_dataframe_constructor_from_namedtuple(): + Point1 = namedtuple("Point1", ["a", "b", "c"]) + Point2 = namedtuple("Point1", ["x", "y"]) + + data = [Point1(1, 2, 3), Point2(4, 5)] + idx = ["a", "b"] + gdf = cudf.DataFrame(data, index=idx) + pdf = pd.DataFrame(data, index=idx) + + assert_eq(gdf, pdf) + + data = [Point2(4, 5), Point1(1, 2, 3)] + with pytest.raises(ValueError): + cudf.DataFrame(data, index=idx) + with pytest.raises(ValueError): + pd.DataFrame(data, index=idx) From e8df03754021e3decfc6640b58bd7a0770b0c230 Mon Sep 17 00:00:00 2001 From: Elias Stehle <3958403+elstehle@users.noreply.github.com> Date: Wed, 9 Aug 2023 21:17:17 +0200 Subject: [PATCH 041/230] Refactors JSON reader's pushdown automaton (#13716) This PR simplifies and cleans up the JSON reader's pushdown automaton. The pushdown automaton takes as input two arrays: 1. The JSON's input characters 2. The stack context for each character (`{` - `JSON object`, `[` - `JSON array`, `_` - `Root of JSON`) Previously, we were fusing the two arrays and materializing them straight to the symbol group id for each combination. A symbol group id serves as the column of the transition table. The symbol group ids array was then used as input to the finite state transducer (FST). After the [recent refactor of the FST](https://github.com/rapidsai/cudf/pull/13344) lookup tables, the FST has become more flexible. 
It now supports arbitrary iterators and the symbol group id lookup table (that maps a symbol to a symbol group id) can now be implemented by a simple function object. This PR takes advantage of the FST's ability to take fancy iterators. We now zip the `json_input` and `stack_context` symbols and pass that `zip_iterator` to the FST. Authors: - Elias Stehle (https://github.com/elstehle) - Vukasin Milovanovic (https://github.com/vuule) - Karthikeyan (https://github.com/karthikeyann) Approvers: - Karthikeyan (https://github.com/karthikeyann) - Vukasin Milovanovic (https://github.com/vuule) URL: https://github.com/rapidsai/cudf/pull/13716 --- cpp/src/io/fst/lookup_tables.cuh | 70 +++++++++++++++++++++++++++++- cpp/src/io/json/nested_json_gpu.cu | 43 +++++++----------- 2 files changed, 85 insertions(+), 28 deletions(-) diff --git a/cpp/src/io/fst/lookup_tables.cuh b/cpp/src/io/fst/lookup_tables.cuh index c4176d5673f..37c99453361 100644 --- a/cpp/src/io/fst/lookup_tables.cuh +++ b/cpp/src/io/fst/lookup_tables.cuh @@ -179,6 +179,74 @@ class SingleSymbolSmemLUT { } }; +/** + * @brief A simple symbol group lookup wrapper that uses a simple function object to + * retrieve the symbol group id for a symbol. + * + * @tparam SymbolGroupLookupOpT The function object type to return the symbol group for a given + * symbol + */ +template +class SymbolGroupLookupOp { + private: + struct _TempStorage {}; + + public: + using TempStorage = cub::Uninitialized<_TempStorage>; + + struct KernelParameter { + // Declare the member type that the DFA is going to instantiate + using LookupTableT = SymbolGroupLookupOp; + SymbolGroupLookupOpT sgid_lookup_op; + }; + + static KernelParameter InitDeviceSymbolGroupIdLut(SymbolGroupLookupOpT sgid_lookup_op) + { + return KernelParameter{sgid_lookup_op}; + } + + private: + _TempStorage& temp_storage; + SymbolGroupLookupOpT sgid_lookup_op; + + __device__ __forceinline__ _TempStorage& PrivateStorage() + { + __shared__ _TempStorage private_storage; + return private_storage; + } + + public: + CUDF_HOST_DEVICE SymbolGroupLookupOp(KernelParameter const& kernel_param, + TempStorage& temp_storage) + : temp_storage(temp_storage.Alias()), sgid_lookup_op(kernel_param.sgid_lookup_op) + { + } + + template + constexpr CUDF_HOST_DEVICE int32_t operator()(SymbolT_ const symbol) const + { + // Look up the symbol group for given symbol + return sgid_lookup_op(symbol); + } +}; + +/** + * @brief Prepares a simple symbol group lookup wrapper that uses a simple function object to + * retrieve the symbol group id for a symbol. + * + * @tparam FunctorT A function object type that must implement the signature `int32_t + * operator()(symbol)`, where `symbol` is a symbol from the input type. + * @param sgid_lookup_op A function object that must implement the signature `int32_t + * operator()(symbol)`, where `symbol` is a symbol from the input type. + * @return The kernel parameter of type SymbolGroupLookupOp::KernelParameter that is used to + * initialize a simple symbol group id lookup wrapper + */ +template +auto make_symbol_group_lookup_op(FunctorT sgid_lookup_op) +{ + return SymbolGroupLookupOp::InitDeviceSymbolGroupIdLut(sgid_lookup_op); +} + /** * @brief Creates a symbol group lookup table of type `SingleSymbolSmemLUT` that uses a two-staged * lookup approach. 
@p pre_map_op is a function object invoked with `(lut, symbol)` that must return @@ -830,7 +898,7 @@ class Dfa { }; /** - * @brief Creates a determninistic finite automaton (DFA) as specified by the triple of (symbol + * @brief Creates a deterministic finite automaton (DFA) as specified by the triple of (symbol * group, transition, translation)-lookup tables to be used with the finite-state transducer * algorithm. * diff --git a/cpp/src/io/json/nested_json_gpu.cu b/cpp/src/io/json/nested_json_gpu.cu index 0629ceb95c6..8552db9a719 100644 --- a/cpp/src/io/json/nested_json_gpu.cu +++ b/cpp/src/io/json/nested_json_gpu.cu @@ -477,7 +477,7 @@ static __constant__ PdaSymbolGroupIdT tos_sg_to_pda_sgid[] = { struct PdaSymbolToSymbolGroupId { template __device__ __forceinline__ PdaSymbolGroupIdT - operator()(thrust::tuple symbol_pair) + operator()(thrust::tuple symbol_pair) const { // The symbol read from the input auto symbol = thrust::get<0>(symbol_pair); @@ -1420,36 +1420,25 @@ std::pair, rmm::device_uvector> ge // Prepare for PDA transducer pass, merging input symbols with stack symbols auto const recover_from_error = (format == tokenizer_pda::json_format_cfg_t::JSON_LINES_RECOVER); - rmm::device_uvector pda_sgids = [json_in, stream, recover_from_error]() { - // Memory holding the top-of-stack stack context for the input - rmm::device_uvector stack_op_indices{json_in.size(), stream}; - - // Identify what is the stack context for each input character (JSON-root, struct, or list) - auto const stack_behavior = recover_from_error ? stack_behavior_t::ResetOnDelimiter - : stack_behavior_t::PushPopWithoutReset; - get_stack_context(json_in, stack_op_indices.data(), stack_behavior, stream); - - rmm::device_uvector pda_sgids{json_in.size(), stream}; - auto zip_in = thrust::make_zip_iterator(json_in.data(), stack_op_indices.data()); - thrust::transform(rmm::exec_policy(stream), - zip_in, - zip_in + json_in.size(), - pda_sgids.data(), - tokenizer_pda::PdaSymbolToSymbolGroupId{}); - return pda_sgids; - }(); - - // Instantiating PDA transducer - std::array, tokenizer_pda::NUM_PDA_SGIDS> pda_sgid_identity{}; - std::generate(std::begin(pda_sgid_identity), - std::end(pda_sgid_identity), - [i = char{0}]() mutable { return std::vector{i++}; }); + + // Memory holding the top-of-stack stack context for the input + rmm::device_uvector stack_symbols{json_in.size(), stream}; + + // Identify what is the stack context for each input character (JSON-root, struct, or list) + auto const stack_behavior = + recover_from_error ? stack_behavior_t::ResetOnDelimiter : stack_behavior_t::PushPopWithoutReset; + get_stack_context(json_in, stack_symbols.data(), stack_behavior, stream); + + // Input to the full pushdown automaton finite-state transducer, where a input symbol comprises + // the combination of a character from the JSON input together with the stack context for that + // character. 
+ auto zip_in = thrust::make_zip_iterator(json_in.data(), stack_symbols.data()); constexpr auto max_translation_table_size = tokenizer_pda::NUM_PDA_SGIDS * static_cast(tokenizer_pda::pda_state_t::PD_NUM_STATES); auto json_to_tokens_fst = fst::detail::make_fst( - fst::detail::make_symbol_group_lut(pda_sgid_identity), + fst::detail::make_symbol_group_lookup_op(tokenizer_pda::PdaSymbolToSymbolGroupId{}), fst::detail::make_transition_table(tokenizer_pda::get_transition_table(format)), fst::detail::make_translation_table( tokenizer_pda::get_translation_table(recover_from_error)), @@ -1473,7 +1462,7 @@ std::pair, rmm::device_uvector> ge rmm::device_uvector tokens_indices{ max_token_out_count + delimiter_offset, stream, mr}; - json_to_tokens_fst.Transduce(pda_sgids.begin(), + json_to_tokens_fst.Transduce(zip_in, static_cast(json_in.size()), tokens.data() + delimiter_offset, tokens_indices.data() + delimiter_offset, From 2801a273e3faf330d364dd23d82dd520861f5526 Mon Sep 17 00:00:00 2001 From: Vukasin Milovanovic Date: Thu, 10 Aug 2023 09:12:09 -0700 Subject: [PATCH 042/230] Remove Arrow dependency from the `datasource.hpp` public header (#13698) Remove arrow dependency from `datasource.hpp`. Breaking only because users of `arrow_io_source` now need to include the new `arrow_io_source.hpp` header instead on `datasource.hpp` Authors: - Vukasin Milovanovic (https://github.com/vuule) Approvers: - GALI PREM SAGAR (https://github.com/galipremsagar) - AJ Schmidt (https://github.com/ajschmidt8) - Nghia Truong (https://github.com/ttnghia) - Yunsong Wang (https://github.com/PointKernel) - Mike Wilson (https://github.com/hyperbolic2346) - Robert Maynard (https://github.com/robertmaynard) URL: https://github.com/rapidsai/cudf/pull/13698 --- conda/recipes/libcudf/meta.yaml | 1 + cpp/CMakeLists.txt | 1 + cpp/include/cudf/io/arrow_io_source.hpp | 85 +++++++++++ cpp/include/cudf/io/datasource.hpp | 140 ------------------ cpp/include/cudf/io/types.hpp | 15 -- cpp/src/io/utilities/arrow_io_source.cpp | 85 +++++++++++ cpp/src/io/utilities/datasource.cpp | 3 + cpp/tests/io/arrow_io_source_test.cpp | 9 +- cpp/tests/io/csv_test.cpp | 2 +- cpp/tests/io/json_test.cpp | 2 +- .../cudf/cudf/_lib/cpp/io/arrow_io_source.pxd | 15 ++ python/cudf/cudf/_lib/cpp/io/data_sink.pxd | 8 + python/cudf/cudf/_lib/cpp/io/datasource.pxd | 8 + python/cudf/cudf/_lib/cpp/io/types.pxd | 29 +--- python/cudf/cudf/_lib/csv.pyx | 2 +- python/cudf/cudf/_lib/io/datasource.pxd | 4 +- python/cudf/cudf/_lib/io/datasource.pyx | 3 +- python/cudf/cudf/_lib/io/utils.pxd | 10 +- python/cudf/cudf/_lib/io/utils.pyx | 6 +- python/cudf/cudf/_lib/json.pyx | 2 +- python/cudf/cudf/_lib/orc.pyx | 2 +- python/cudf/cudf/_lib/parquet.pyx | 5 +- python/cudf_kafka/cudf_kafka/_lib/kafka.pxd | 4 +- python/cudf_kafka/cudf_kafka/_lib/kafka.pyx | 2 +- 24 files changed, 239 insertions(+), 204 deletions(-) create mode 100644 cpp/include/cudf/io/arrow_io_source.hpp create mode 100644 cpp/src/io/utilities/arrow_io_source.cpp create mode 100644 python/cudf/cudf/_lib/cpp/io/arrow_io_source.pxd create mode 100644 python/cudf/cudf/_lib/cpp/io/data_sink.pxd create mode 100644 python/cudf/cudf/_lib/cpp/io/datasource.pxd diff --git a/conda/recipes/libcudf/meta.yaml b/conda/recipes/libcudf/meta.yaml index 81026b58e33..de32facba74 100644 --- a/conda/recipes/libcudf/meta.yaml +++ b/conda/recipes/libcudf/meta.yaml @@ -194,6 +194,7 @@ outputs: - test -f $PREFIX/include/cudf/hashing.hpp - test -f $PREFIX/include/cudf/hashing/detail/hashing.hpp - test -f $PREFIX/include/cudf/interop.hpp + - 
test -f $PREFIX/include/cudf/io/arrow_io_source.hpp - test -f $PREFIX/include/cudf/io/avro.hpp - test -f $PREFIX/include/cudf/io/csv.hpp - test -f $PREFIX/include/cudf/io/data_sink.hpp diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index d6b2fb10c23..054f3b290a3 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -408,6 +408,7 @@ add_library( src/io/text/bgzip_data_chunk_source.cu src/io/text/bgzip_utils.cpp src/io/text/multibyte_split.cu + src/io/utilities/arrow_io_source.cpp src/io/utilities/column_buffer.cpp src/io/utilities/config_utils.cpp src/io/utilities/data_sink.cpp diff --git a/cpp/include/cudf/io/arrow_io_source.hpp b/cpp/include/cudf/io/arrow_io_source.hpp new file mode 100644 index 00000000000..5f79f05c5a1 --- /dev/null +++ b/cpp/include/cudf/io/arrow_io_source.hpp @@ -0,0 +1,85 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include "datasource.hpp" + +#include +#include + +#include +#include + +namespace cudf::io { +/** + * @addtogroup io_datasources + * @{ + * @file + */ + +/** + * @brief Implementation class for reading from an Apache Arrow file. The file + * could be a memory-mapped file or other implementation supported by Arrow. + */ +class arrow_io_source : public datasource { + public: + /** + * @brief Constructs an object from an Apache Arrow Filesystem URI + * + * @param arrow_uri Apache Arrow Filesystem URI + */ + explicit arrow_io_source(std::string const& arrow_uri); + + /** + * @brief Constructs an object from an `arrow` source object. + * + * @param file The `arrow` object from which the data is read + */ + explicit arrow_io_source(std::shared_ptr file) : arrow_file(file) {} + + /** + * @brief Returns a buffer with a subset of data from the `arrow` source. + * + * @param offset The offset in bytes from which to read + * @param size The number of bytes to read + * @return A buffer with the read data + */ + std::unique_ptr host_read(size_t offset, size_t size) override; + + /** + * @brief Reads a selected range from the `arrow` source into a preallocated buffer. + * + * @param[in] offset The offset in bytes from which to read + * @param[in] size The number of bytes to read + * @param[out] dst The preallocated buffer to read into + * @return The number of bytes read + */ + size_t host_read(size_t offset, size_t size, uint8_t* dst) override; + /** + * @brief Returns the size of the data in the `arrow` source. 
+ * + * @return The size of the data in the `arrow` source + */ + [[nodiscard]] size_t size() const override; + + private: + std::shared_ptr filesystem; + std::shared_ptr arrow_file; +}; + +/** @} */ // end of group +} // namespace cudf::io diff --git a/cpp/include/cudf/io/datasource.hpp b/cpp/include/cudf/io/datasource.hpp index eac19dc81cd..28263d466f3 100644 --- a/cpp/include/cudf/io/datasource.hpp +++ b/cpp/include/cudf/io/datasource.hpp @@ -22,35 +22,6 @@ #include -#include - -// We disable warning 611 because some Arrow subclasses of -// `arrow::fs::FileSystem` only partially override the `Equals` method, -// triggering warning 611-D from nvcc. -#ifdef __CUDACC__ -#pragma nv_diag_suppress 611 -#endif -#include -#include -#ifdef __CUDACC__ -#pragma nv_diag_default 611 -#endif - -// We disable warning 2810 to workaround the compile issue (warning treated as error): -// result.h(263): error #2810-D: ignoring return value type with "nodiscard" attribute -#ifdef __CUDACC__ -#pragma nv_diag_suppress 2810 -#endif -#include -#ifdef __CUDACC__ -#pragma nv_diag_default 2810 -#endif - -#include -#include -#include -#include - #include #include @@ -149,15 +120,6 @@ class datasource { */ static std::unique_ptr create(cudf::device_span buffer); - /** - * @brief Creates a source from a from an Arrow file. - * - * @param[in] arrow_file RandomAccessFile to which the API calls are forwarded - * @return Constructed datasource object - */ - static std::unique_ptr create( - std::shared_ptr arrow_file); - /** * @brief Creates a source from an user implemented datasource object. * @@ -412,108 +374,6 @@ class datasource { }; }; -/** - * @brief Implementation class for reading from an Apache Arrow file. The file - * could be a memory-mapped file or other implementation supported by Arrow. - */ -class arrow_io_source : public datasource { - /** - * @brief Implementation for an owning buffer where `arrow::Buffer` holds the data. - */ - class arrow_io_buffer : public buffer { - std::shared_ptr arrow_buffer; - - public: - explicit arrow_io_buffer(std::shared_ptr arrow_buffer) - : arrow_buffer(arrow_buffer) - { - } - [[nodiscard]] size_t size() const override { return arrow_buffer->size(); } - [[nodiscard]] uint8_t const* data() const override { return arrow_buffer->data(); } - }; - - public: - /** - * @brief Constructs an object from an Apache Arrow Filesystem URI - * - * @param arrow_uri Apache Arrow Filesystem URI - */ - explicit arrow_io_source(std::string_view arrow_uri) - { - std::string const uri_start_delimiter = "//"; - std::string const uri_end_delimiter = "?"; - - arrow::Result> result = - arrow::fs::FileSystemFromUri(static_cast(arrow_uri)); - CUDF_EXPECTS(result.ok(), "Failed to generate Arrow Filesystem instance from URI."); - filesystem = result.ValueOrDie(); - - // Parse the path from the URI - size_t start = arrow_uri.find(uri_start_delimiter) == std::string::npos - ? 0 - : arrow_uri.find(uri_start_delimiter) + uri_start_delimiter.size(); - size_t end = arrow_uri.find(uri_end_delimiter) - start; - std::string_view path = arrow_uri.substr(start, end); - - arrow::Result> in_stream = - filesystem->OpenInputFile(static_cast(path).c_str()); - CUDF_EXPECTS(in_stream.ok(), "Failed to open Arrow RandomAccessFile"); - arrow_file = in_stream.ValueOrDie(); - } - - /** - * @brief Constructs an object from an `arrow` source object. 
- * - * @param file The `arrow` object from which the data is read - */ - explicit arrow_io_source(std::shared_ptr file) : arrow_file(file) {} - - /** - * @brief Returns a buffer with a subset of data from the `arrow` source. - * - * @param offset The offset in bytes from which to read - * @param size The number of bytes to read - * @return A buffer with the read data - */ - std::unique_ptr host_read(size_t offset, size_t size) override - { - auto result = arrow_file->ReadAt(offset, size); - CUDF_EXPECTS(result.ok(), "Cannot read file data"); - return std::make_unique(result.ValueOrDie()); - } - - /** - * @brief Reads a selected range from the `arrow` source into a preallocated buffer. - * - * @param[in] offset The offset in bytes from which to read - * @param[in] size The number of bytes to read - * @param[out] dst The preallocated buffer to read into - * @return The number of bytes read - */ - size_t host_read(size_t offset, size_t size, uint8_t* dst) override - { - auto result = arrow_file->ReadAt(offset, size, dst); - CUDF_EXPECTS(result.ok(), "Cannot read file data"); - return result.ValueOrDie(); - } - - /** - * @brief Returns the size of the data in the `arrow` source. - * - * @return The size of the data in the `arrow` source - */ - [[nodiscard]] size_t size() const override - { - auto result = arrow_file->GetSize(); - CUDF_EXPECTS(result.ok(), "Cannot get file size"); - return result.ValueOrDie(); - } - - private: - std::shared_ptr filesystem; - std::shared_ptr arrow_file; -}; - /** @} */ // end of group } // namespace io } // namespace cudf diff --git a/cpp/include/cudf/io/types.hpp b/cpp/include/cudf/io/types.hpp index b08c50574b6..9b0dcff99af 100644 --- a/cpp/include/cudf/io/types.hpp +++ b/cpp/include/cudf/io/types.hpp @@ -32,13 +32,6 @@ #include #include -// Forward declarations -namespace arrow { -namespace io { -class RandomAccessFile; -} -} // namespace arrow - namespace cudf { //! IO interfaces namespace io { @@ -286,8 +279,6 @@ constexpr inline auto is_byte_like_type() * @brief Source information for read interfaces */ struct source_info { - std::vector> _files; //!< Input files - source_info() = default; /** @@ -438,12 +429,6 @@ struct source_info { * @return The device buffers of the input */ [[nodiscard]] auto const& device_buffers() const { return _device_buffers; } - /** - * @brief Get the input files - * - * @return The input files - */ - [[nodiscard]] auto const& files() const { return _files; } /** * @brief Get the user sources of the input * diff --git a/cpp/src/io/utilities/arrow_io_source.cpp b/cpp/src/io/utilities/arrow_io_source.cpp new file mode 100644 index 00000000000..d647f3c0a4b --- /dev/null +++ b/cpp/src/io/utilities/arrow_io_source.cpp @@ -0,0 +1,85 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include + +#include +#include +#include + +#include +#include + +namespace cudf::io { + +/** + * @brief Implementation for an owning buffer where `arrow::Buffer` holds the data. 
+ */ +class arrow_io_buffer : public datasource::buffer { + std::shared_ptr arrow_buffer; + + public: + explicit arrow_io_buffer(std::shared_ptr arrow_buffer) : arrow_buffer(arrow_buffer) + { + } + [[nodiscard]] size_t size() const override { return arrow_buffer->size(); } + [[nodiscard]] uint8_t const* data() const override { return arrow_buffer->data(); } +}; + +arrow_io_source::arrow_io_source(std::string const& arrow_uri) +{ + std::string const uri_start_delimiter = "//"; + std::string const uri_end_delimiter = "?"; + + auto const result = arrow::fs::FileSystemFromUri(arrow_uri); + CUDF_EXPECTS(result.ok(), "Failed to generate Arrow Filesystem instance from URI."); + filesystem = result.ValueOrDie(); + + // Parse the path from the URI + auto const start = [&]() { + auto const delim_start = arrow_uri.find(uri_start_delimiter); + return delim_start == std::string::npos ? 0 : delim_start + uri_start_delimiter.size(); + }(); + auto const end = arrow_uri.find(uri_end_delimiter) - start; + auto const path = arrow_uri.substr(start, end); + + auto const in_stream = filesystem->OpenInputFile(path); + CUDF_EXPECTS(in_stream.ok(), "Failed to open Arrow RandomAccessFile"); + arrow_file = in_stream.ValueOrDie(); +} + +std::unique_ptr arrow_io_source::host_read(size_t offset, size_t size) +{ + auto const result = arrow_file->ReadAt(offset, size); + CUDF_EXPECTS(result.ok(), "Cannot read file data"); + return std::make_unique(result.ValueOrDie()); +} + +size_t arrow_io_source::host_read(size_t offset, size_t size, uint8_t* dst) +{ + auto const result = arrow_file->ReadAt(offset, size, dst); + CUDF_EXPECTS(result.ok(), "Cannot read file data"); + return result.ValueOrDie(); +} + +[[nodiscard]] size_t arrow_io_source::size() const +{ + auto const result = arrow_file->GetSize(); + CUDF_EXPECTS(result.ok(), "Cannot get file size"); + return result.ValueOrDie(); +} + +} // namespace cudf::io diff --git a/cpp/src/io/utilities/datasource.cpp b/cpp/src/io/utilities/datasource.cpp index 8aea8b4f69c..6186d9d9736 100644 --- a/cpp/src/io/utilities/datasource.cpp +++ b/cpp/src/io/utilities/datasource.cpp @@ -17,6 +17,7 @@ #include "file_io_utilities.hpp" #include +#include #include #include #include @@ -25,6 +26,8 @@ #include #include +#include + #include #include #include diff --git a/cpp/tests/io/arrow_io_source_test.cpp b/cpp/tests/io/arrow_io_source_test.cpp index 89600d4cb46..979f8e4fb05 100644 --- a/cpp/tests/io/arrow_io_source_test.cpp +++ b/cpp/tests/io/arrow_io_source_test.cpp @@ -21,11 +21,12 @@ #include #include -#include +#include #include #include #include +#include #include #include @@ -48,8 +49,7 @@ TEST_F(ArrowIOTest, URIFileSystem) outfile.close(); std::string file_uri = "file://" + file_name; - std::unique_ptr datasource = - std::make_unique(file_uri); + auto datasource = std::make_unique(file_uri); // Populate the JSON Reader Options cudf::io::json_reader_options options = @@ -72,8 +72,7 @@ TEST_F(ArrowIOTest, S3FileSystem) if (s3_unsupported) { EXPECT_THROW(std::make_unique(s3_uri), cudf::logic_error); } else { - std::unique_ptr datasource = - std::make_unique(s3_uri); + auto datasource = std::make_unique(s3_uri); // Populate the Parquet Reader Options cudf::io::source_info src(datasource.get()); diff --git a/cpp/tests/io/csv_test.cpp b/cpp/tests/io/csv_test.cpp index 2b501f45b47..8922658ac97 100644 --- a/cpp/tests/io/csv_test.cpp +++ b/cpp/tests/io/csv_test.cpp @@ -23,8 +23,8 @@ #include #include +#include #include -#include #include #include #include diff --git a/cpp/tests/io/json_test.cpp 
b/cpp/tests/io/json_test.cpp index 5a30be755d3..220f1a3391f 100644 --- a/cpp/tests/io/json_test.cpp +++ b/cpp/tests/io/json_test.cpp @@ -23,7 +23,7 @@ #include #include -#include +#include #include #include #include diff --git a/python/cudf/cudf/_lib/cpp/io/arrow_io_source.pxd b/python/cudf/cudf/_lib/cpp/io/arrow_io_source.pxd new file mode 100644 index 00000000000..4aef4841844 --- /dev/null +++ b/python/cudf/cudf/_lib/cpp/io/arrow_io_source.pxd @@ -0,0 +1,15 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. + +from libcpp.memory cimport shared_ptr +from libcpp.string cimport string +from pyarrow.includes.libarrow cimport CRandomAccessFile + +cimport cudf._lib.cpp.io.datasource as cudf_io_datasource + + +cdef extern from "cudf/io/arrow_io_source.hpp" \ + namespace "cudf::io" nogil: + + cdef cppclass arrow_io_source(cudf_io_datasource.datasource): + arrow_io_source(const string& arrow_uri) except + + arrow_io_source(shared_ptr[CRandomAccessFile]) except + diff --git a/python/cudf/cudf/_lib/cpp/io/data_sink.pxd b/python/cudf/cudf/_lib/cpp/io/data_sink.pxd new file mode 100644 index 00000000000..e939a47d7f9 --- /dev/null +++ b/python/cudf/cudf/_lib/cpp/io/data_sink.pxd @@ -0,0 +1,8 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. + + +cdef extern from "cudf/io/data_sink.hpp" \ + namespace "cudf::io" nogil: + + cdef cppclass data_sink: + pass diff --git a/python/cudf/cudf/_lib/cpp/io/datasource.pxd b/python/cudf/cudf/_lib/cpp/io/datasource.pxd new file mode 100644 index 00000000000..c69aa65bd3c --- /dev/null +++ b/python/cudf/cudf/_lib/cpp/io/datasource.pxd @@ -0,0 +1,8 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. + + +cdef extern from "cudf/io/datasource.hpp" \ + namespace "cudf::io" nogil: + + cdef cppclass datasource: + pass diff --git a/python/cudf/cudf/_lib/cpp/io/types.pxd b/python/cudf/cudf/_lib/cpp/io/types.pxd index b2b0a77c45f..01eaca82692 100644 --- a/python/cudf/cudf/_lib/cpp/io/types.pxd +++ b/python/cudf/cudf/_lib/cpp/io/types.pxd @@ -10,6 +10,8 @@ from libcpp.unordered_map cimport unordered_map from libcpp.vector cimport vector from pyarrow.includes.libarrow cimport CRandomAccessFile +cimport cudf._lib.cpp.io.data_sink as cudf_io_data_sink +cimport cudf._lib.cpp.io.datasource as cudf_io_datasource cimport cudf._lib.cpp.table.table_view as cudf_table_view from cudf._lib.cpp.table.table cimport table from cudf._lib.cpp.types cimport size_type @@ -105,35 +107,18 @@ cdef extern from "cudf/io/types.hpp" \ source_info() except + source_info(const vector[string] &filepaths) except + source_info(const vector[host_buffer] &host_buffers) except + - source_info(datasource *source) except + - source_info(const vector[datasource*] &datasources) except + + source_info(cudf_io_datasource.datasource *source) except + + source_info(const vector[cudf_io_datasource.datasource*] &datasources) except + cdef cppclass sink_info: io_type type const vector[string]& filepaths() const vector[vector[char] *]& buffers() - const vector[data_sink *]& user_sinks() + const vector[cudf_io_data_sink.data_sink *]& user_sinks() sink_info() except + sink_info(string file_path) except + sink_info(vector[string] file_path) except + sink_info(vector[char] * buffer) except + - sink_info(data_sink * user_sink) except + - sink_info(vector[data_sink *] user_sink) except + - - -cdef extern from "cudf/io/data_sink.hpp" \ - namespace "cudf::io" nogil: - - cdef cppclass data_sink: - pass - -cdef extern from "cudf/io/datasource.hpp" \ - namespace "cudf::io" nogil: - - cdef cppclass datasource: - pass - - cdef cppclass 
arrow_io_source(datasource): - arrow_io_source(string arrow_uri) except + - arrow_io_source(shared_ptr[CRandomAccessFile]) except + + sink_info(cudf_io_data_sink.data_sink * user_sink) except + + sink_info(vector[cudf_io_data_sink.data_sink *] user_sink) except + diff --git a/python/cudf/cudf/_lib/csv.pyx b/python/cudf/cudf/_lib/csv.pyx index 3f275e2635f..630dcf73545 100644 --- a/python/cudf/cudf/_lib/csv.pyx +++ b/python/cudf/cudf/_lib/csv.pyx @@ -35,9 +35,9 @@ from cudf._lib.cpp.io.csv cimport ( read_csv as cpp_read_csv, write_csv as cpp_write_csv, ) +from cudf._lib.cpp.io.data_sink cimport data_sink from cudf._lib.cpp.io.types cimport ( compression_type, - data_sink, quote_style, sink_info, source_info, diff --git a/python/cudf/cudf/_lib/io/datasource.pxd b/python/cudf/cudf/_lib/io/datasource.pxd index e8fe79d2685..bd5bf0227a5 100644 --- a/python/cudf/cudf/_lib/io/datasource.pxd +++ b/python/cudf/cudf/_lib/io/datasource.pxd @@ -2,12 +2,14 @@ from libcpp.memory cimport shared_ptr -from cudf._lib.cpp.io.types cimport arrow_io_source, datasource +from cudf._lib.cpp.io.arrow_io_source cimport arrow_io_source +from cudf._lib.cpp.io.datasource cimport datasource cdef class Datasource: cdef datasource* get_datasource(self) except * nogil + cdef class NativeFileDatasource(Datasource): cdef shared_ptr[arrow_io_source] c_datasource cdef datasource* get_datasource(self) nogil diff --git a/python/cudf/cudf/_lib/io/datasource.pyx b/python/cudf/cudf/_lib/io/datasource.pyx index b39a1aee9b8..5cadd58d8d3 100644 --- a/python/cudf/cudf/_lib/io/datasource.pyx +++ b/python/cudf/cudf/_lib/io/datasource.pyx @@ -4,7 +4,8 @@ from libcpp.memory cimport shared_ptr from pyarrow.includes.libarrow cimport CRandomAccessFile from pyarrow.lib cimport NativeFile -from cudf._lib.cpp.io.types cimport arrow_io_source, datasource +from cudf._lib.cpp.io.arrow_io_source cimport arrow_io_source +from cudf._lib.cpp.io.datasource cimport datasource cdef class Datasource: diff --git a/python/cudf/cudf/_lib/io/utils.pxd b/python/cudf/cudf/_lib/io/utils.pxd index af1f2521d4a..2c2d52b512b 100644 --- a/python/cudf/cudf/_lib/io/utils.pxd +++ b/python/cudf/cudf/_lib/io/utils.pxd @@ -1,15 +1,11 @@ -# Copyright (c) 2020-2021, NVIDIA CORPORATION. +# Copyright (c) 2020-2023, NVIDIA CORPORATION. from libcpp.memory cimport unique_ptr from libcpp.vector cimport vector from cudf._lib.column cimport Column -from cudf._lib.cpp.io.types cimport ( - column_name_info, - data_sink, - sink_info, - source_info, -) +from cudf._lib.cpp.io.data_sink cimport data_sink +from cudf._lib.cpp.io.types cimport column_name_info, sink_info, source_info cdef source_info make_source_info(list src) except* diff --git a/python/cudf/cudf/_lib/io/utils.pyx b/python/cudf/cudf/_lib/io/utils.pyx index 7dbe395be79..9b027a4d275 100644 --- a/python/cudf/cudf/_lib/io/utils.pyx +++ b/python/cudf/cudf/_lib/io/utils.pyx @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2022, NVIDIA CORPORATION. +# Copyright (c) 2020-2023, NVIDIA CORPORATION. 
from cpython.buffer cimport PyBUF_READ from cpython.memoryview cimport PyMemoryView_FromMemory @@ -8,10 +8,10 @@ from libcpp.utility cimport move from libcpp.vector cimport vector from cudf._lib.column cimport Column +from cudf._lib.cpp.io.data_sink cimport data_sink +from cudf._lib.cpp.io.datasource cimport datasource from cudf._lib.cpp.io.types cimport ( column_name_info, - data_sink, - datasource, host_buffer, sink_info, source_info, diff --git a/python/cudf/cudf/_lib/json.pyx b/python/cudf/cudf/_lib/json.pyx index 611baed7fd7..64189a5626f 100644 --- a/python/cudf/cudf/_lib/json.pyx +++ b/python/cudf/cudf/_lib/json.pyx @@ -17,6 +17,7 @@ from libcpp.utility cimport move from libcpp.vector cimport vector cimport cudf._lib.cpp.io.types as cudf_io_types +from cudf._lib.cpp.io.data_sink cimport data_sink from cudf._lib.cpp.io.json cimport ( json_reader_options, json_writer_options, @@ -27,7 +28,6 @@ from cudf._lib.cpp.io.json cimport ( from cudf._lib.cpp.io.types cimport ( column_name_info, compression_type, - data_sink, sink_info, table_metadata, table_with_metadata, diff --git a/python/cudf/cudf/_lib/orc.pyx b/python/cudf/cudf/_lib/orc.pyx index dfe5bcf9d53..0ae039b14d2 100644 --- a/python/cudf/cudf/_lib/orc.pyx +++ b/python/cudf/cudf/_lib/orc.pyx @@ -21,6 +21,7 @@ except ImportError: cimport cudf._lib.cpp.io.types as cudf_io_types from cudf._lib.column cimport Column +from cudf._lib.cpp.io.data_sink cimport data_sink from cudf._lib.cpp.io.orc cimport ( chunked_orc_writer_options, orc_chunked_writer, @@ -36,7 +37,6 @@ from cudf._lib.cpp.io.orc_metadata cimport ( from cudf._lib.cpp.io.types cimport ( column_in_metadata, compression_type, - data_sink, sink_info, source_info, table_input_metadata, diff --git a/python/cudf/cudf/_lib/parquet.pyx b/python/cudf/cudf/_lib/parquet.pyx index d297c80ab5a..85fd25cf1a9 100644 --- a/python/cudf/cudf/_lib/parquet.pyx +++ b/python/cudf/cudf/_lib/parquet.pyx @@ -38,6 +38,7 @@ from libcpp.unordered_map cimport unordered_map from libcpp.utility cimport move from libcpp.vector cimport vector +cimport cudf._lib.cpp.io.data_sink as cudf_io_data_sink cimport cudf._lib.cpp.io.types as cudf_io_types cimport cudf._lib.cpp.types as cudf_types from cudf._lib.column cimport Column @@ -334,7 +335,7 @@ def write_parquet( cdef vector[map[string, string]] user_data cdef table_view tv - cdef vector[unique_ptr[cudf_io_types.data_sink]] _data_sinks + cdef vector[unique_ptr[cudf_io_data_sink.data_sink]] _data_sinks cdef cudf_io_types.sink_info sink = make_sinks_info( filepaths_or_buffers, _data_sinks ) @@ -476,7 +477,7 @@ cdef class ParquetWriter: cdef unique_ptr[cpp_parquet_chunked_writer] writer cdef table_input_metadata tbl_meta cdef cudf_io_types.sink_info sink - cdef vector[unique_ptr[cudf_io_types.data_sink]] _data_sink + cdef vector[unique_ptr[cudf_io_data_sink.data_sink]] _data_sink cdef cudf_io_types.statistics_freq stat_freq cdef cudf_io_types.compression_type comp_type cdef object index diff --git a/python/cudf_kafka/cudf_kafka/_lib/kafka.pxd b/python/cudf_kafka/cudf_kafka/_lib/kafka.pxd index e64d8f82739..ca729c62512 100644 --- a/python/cudf_kafka/cudf_kafka/_lib/kafka.pxd +++ b/python/cudf_kafka/cudf_kafka/_lib/kafka.pxd @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2022, NVIDIA CORPORATION. +# Copyright (c) 2020-2023, NVIDIA CORPORATION. 
from libc.stdint cimport int32_t, int64_t from libcpp cimport bool @@ -7,7 +7,7 @@ from libcpp.memory cimport unique_ptr from libcpp.string cimport string from libcpp.vector cimport vector -from cudf._lib.cpp.io.types cimport datasource +from cudf._lib.cpp.io.datasource cimport datasource from cudf._lib.io.datasource cimport Datasource diff --git a/python/cudf_kafka/cudf_kafka/_lib/kafka.pyx b/python/cudf_kafka/cudf_kafka/_lib/kafka.pyx index 52278188281..4d732478723 100644 --- a/python/cudf_kafka/cudf_kafka/_lib/kafka.pyx +++ b/python/cudf_kafka/cudf_kafka/_lib/kafka.pyx @@ -7,7 +7,7 @@ from libcpp.memory cimport unique_ptr from libcpp.string cimport string from libcpp.utility cimport move -from cudf._lib.cpp.io.types cimport datasource +from cudf._lib.cpp.io.datasource cimport datasource from cudf._lib.cpp.libcpp.memory cimport make_unique from cudf_kafka._lib.kafka cimport kafka_consumer From b743cc7bff33584434705d41ef452718abdc7ce6 Mon Sep 17 00:00:00 2001 From: GALI PREM SAGAR Date: Thu, 10 Aug 2023 11:45:00 -0500 Subject: [PATCH 043/230] Fix binary operations between `Series` and `Index` (#13842) Fixes: #13836 This PR fixes binary operations between `Series` & `Index` (and vice-versa) by properly handling the return type and the names being assigned to the return results. This PR also adds an early exit that will return `False` for `eq` operations when the other column being compared is not of the same type. Authors: - GALI PREM SAGAR (https://github.com/galipremsagar) Approvers: - Bradley Dice (https://github.com/bdice) - Matthew Roeschke (https://github.com/mroeschke) URL: https://github.com/rapidsai/cudf/pull/13842 --- python/cudf/cudf/core/_base_index.py | 6 +- python/cudf/cudf/core/column/datetime.py | 22 ++++-- python/cudf/cudf/core/column/timedelta.py | 9 ++- python/cudf/cudf/core/index.py | 22 +++++- python/cudf/cudf/core/multiindex.py | 4 +- python/cudf/cudf/core/series.py | 7 +- python/cudf/cudf/core/single_column_frame.py | 4 +- python/cudf/cudf/tests/test_binops.py | 72 ++++++++++++++++++++ python/cudf/cudf/utils/utils.py | 47 +++++++++++++ 9 files changed, 174 insertions(+), 19 deletions(-) diff --git a/python/cudf/cudf/core/_base_index.py b/python/cudf/cudf/core/_base_index.py index 981a8323138..d593f0df138 100644 --- a/python/cudf/cudf/core/_base_index.py +++ b/python/cudf/cudf/core/_base_index.py @@ -31,6 +31,7 @@ from cudf.core.column_accessor import ColumnAccessor from cudf.utils import ioutils from cudf.utils.dtypes import is_mixed_with_object_dtype +from cudf.utils.utils import _is_same_name class BaseIndex(Serializable): @@ -2010,7 +2011,4 @@ def _split(self, splits): def _get_result_name(left_name, right_name): - if left_name == right_name: - return left_name - else: - return None + return left_name if _is_same_name(left_name, right_name) else None diff --git a/python/cudf/cudf/core/column/datetime.py b/python/cudf/cudf/core/column/datetime.py index 84d283d3f22..8073092775d 100644 --- a/python/cudf/cudf/core/column/datetime.py +++ b/python/cudf/cudf/core/column/datetime.py @@ -31,7 +31,7 @@ from cudf.core.column import ColumnBase, as_column, column, string from cudf.core.column.timedelta import _unit_to_nanoseconds_conversion from cudf.utils.dtypes import _get_base_dtype -from cudf.utils.utils import _fillna_natwise +from cudf.utils.utils import _all_bools_with_nulls, _fillna_natwise _guess_datetime_format = pd.core.tools.datetimes.guess_datetime_format @@ -424,12 +424,8 @@ def _binaryop(self, other: ColumnBinaryOperand, op: str) -> ColumnBase: ) lhs, rhs = 
(other, self) if reflect else (self, other) out_dtype = None - if op in { - "__eq__", - "NULL_EQUALS", - }: - out_dtype = cudf.dtype(np.bool_) - elif ( + + if ( op in { "__ne__", @@ -455,6 +451,18 @@ def _binaryop(self, other: ColumnBinaryOperand, op: str) -> ColumnBase: # well-defined if this operation was not invoked via reflection. elif other_is_timedelta and not reflect: out_dtype = _resolve_mixed_dtypes(lhs, rhs, "datetime64") + elif op in { + "__eq__", + "NULL_EQUALS", + "__ne__", + }: + out_dtype = cudf.dtype(np.bool_) + if isinstance(other, ColumnBase) and not isinstance( + other, DatetimeColumn + ): + return _all_bools_with_nulls( + self, other, bool_fill_value=op == "__ne__" + ) if out_dtype is None: return NotImplemented diff --git a/python/cudf/cudf/core/column/timedelta.py b/python/cudf/cudf/core/column/timedelta.py index 272f6e20985..b571461b307 100644 --- a/python/cudf/cudf/core/column/timedelta.py +++ b/python/cudf/cudf/core/column/timedelta.py @@ -16,7 +16,7 @@ from cudf.core.buffer import Buffer, acquire_spill_lock from cudf.core.column import ColumnBase, column, string from cudf.utils.dtypes import np_to_pa_dtype -from cudf.utils.utils import _fillna_natwise +from cudf.utils.utils import _all_bools_with_nulls, _fillna_natwise _dtype_to_format_conversion = { "timedelta64[ns]": "%D days %H:%M:%S", @@ -202,6 +202,13 @@ def _binaryop(self, other: ColumnBinaryOperand, op: str) -> ColumnBase: elif other.dtype.kind in {"f", "i", "u"}: if op in {"__mul__", "__mod__", "__truediv__", "__floordiv__"}: out_dtype = self.dtype + elif op in {"__eq__", "NULL_EQUALS", "__ne__"}: + if isinstance(other, ColumnBase) and not isinstance( + other, TimeDeltaColumn + ): + return _all_bools_with_nulls( + self, other, bool_fill_value=op == "__ne__" + ) if out_dtype is None: return NotImplemented diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py index 44a1620da8a..fa9e63fa4cc 100644 --- a/python/cudf/cudf/core/index.py +++ b/python/cudf/cudf/core/index.py @@ -65,6 +65,7 @@ ) from cudf.utils.utils import ( _cudf_nvtx_annotate, + _is_same_name, _warn_no_dask_cudf, search_range, ) @@ -1029,12 +1030,27 @@ def _binaryop( operands = self._make_operands_for_binop(other, fill_value, reflect) if operands is NotImplemented: return NotImplemented - ret = _index_from_data(self._colwise_binop(operands, op)) + binop_result = self._colwise_binop(operands, op) + + if isinstance(other, cudf.Series): + ret = other._from_data_like_self(binop_result) + ret.name = ( + self.name + if cudf.utils.utils._is_same_name(self.name, other.name) + else None + ) + else: + ret = _index_from_data(binop_result) # pandas returns numpy arrays when the outputs are boolean. We # explicitly _do not_ use isinstance here: we want only boolean # GenericIndexes, not dtype-specific subclasses. 
- if type(ret) is GenericIndex and ret.dtype.kind == "b": + if ( + isinstance(ret, (GenericIndex, cudf.Series)) + and ret.dtype.kind == "b" + ): + if ret._column.has_nulls(): + ret = ret.fillna(op == "__ne__") return ret.values return ret @@ -3309,7 +3325,7 @@ def as_index(arbitrary, nan_as_null=None, **kwargs) -> BaseIndex: if isinstance(arbitrary, cudf.MultiIndex): return arbitrary elif isinstance(arbitrary, BaseIndex): - if arbitrary.name == kwargs["name"]: + if _is_same_name(arbitrary.name, kwargs["name"]): return arbitrary idx = arbitrary.copy(deep=False) idx.rename(kwargs["name"], inplace=True) diff --git a/python/cudf/cudf/core/multiindex.py b/python/cudf/cudf/core/multiindex.py index 9285a21f696..bb0c25a9970 100644 --- a/python/cudf/cudf/core/multiindex.py +++ b/python/cudf/cudf/core/multiindex.py @@ -25,7 +25,7 @@ from cudf.core._compat import PANDAS_GE_150 from cudf.core.frame import Frame from cudf.core.index import BaseIndex, _lexsorted_equal_range, as_index -from cudf.utils.utils import NotIterable, _cudf_nvtx_annotate +from cudf.utils.utils import NotIterable, _cudf_nvtx_annotate, _is_same_name def _maybe_indices_to_slice(indices: cp.ndarray) -> Union[slice, cp.ndarray]: @@ -1891,7 +1891,7 @@ def _maybe_match_names(self, other): if len(self.names) != len(other.names): return [None] * len(self.names) return [ - self_name if self_name == other_name else None + self_name if _is_same_name(self_name, other_name) else None for self_name, other_name in zip(self.names, other.names) ] diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index 6ff0584538e..a0cec69eca9 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -1505,9 +1505,14 @@ def _make_operands_and_index_for_binop( "Can only compare identically-labeled Series objects" ) lhs, other = _align_indices([self, other], allow_non_unique=True) - can_use_self_column_name = self.name == other.name else: lhs = self + + try: + can_use_self_column_name = cudf.utils.utils._is_same_name( + self.name, other.name + ) + except AttributeError: can_use_self_column_name = False operands = lhs._make_operands_for_binop(other, fill_value, reflect) diff --git a/python/cudf/cudf/core/single_column_frame.py b/python/cudf/cudf/core/single_column_frame.py index ffb432ed14a..7c019f0722c 100644 --- a/python/cudf/cudf/core/single_column_frame.py +++ b/python/cudf/cudf/core/single_column_frame.py @@ -327,7 +327,9 @@ def _make_operands_for_binop( # Get the appropriate name for output operations involving two objects # that are Series-like objects. The output shares the lhs's name unless # the rhs is a _differently_ named Series-like object. 
- if isinstance(other, SingleColumnFrame) and self.name != other.name: + if isinstance( + other, SingleColumnFrame + ) and not cudf.utils.utils._is_same_name(self.name, other.name): result_name = None else: result_name = self.name diff --git a/python/cudf/cudf/tests/test_binops.py b/python/cudf/cudf/tests/test_binops.py index 6307c3655c1..4965fb728c2 100644 --- a/python/cudf/cudf/tests/test_binops.py +++ b/python/cudf/cudf/tests/test_binops.py @@ -3,6 +3,7 @@ import decimal import operator import random +import warnings from itertools import combinations_with_replacement, product import cupy as cp @@ -149,6 +150,33 @@ lambda x: cudf.Scalar(0) / x, ] +_series_or_index_names = [ + None, + pd.NA, + cudf.NA, + np.nan, + float("NaN"), + "abc", + 1, + pd.NaT, + np.datetime64("nat"), + np.timedelta64("NaT"), + np.timedelta64(10, "D"), + np.timedelta64(5, "D"), + np.datetime64("1970-01-01 00:00:00.000000001"), + np.datetime64("1970-01-01 00:00:00.000000002"), + pd.Timestamp(1), + pd.Timestamp(2), + pd.Timedelta(1), + pd.Timedelta(2), + decimal.Decimal("NaN"), + decimal.Decimal("1.2"), + np.int64(1), + np.int32(1), + np.float32(1), + pd.Timestamp(1), +] + pytest_xfail = pytest.mark.xfail pytestmark = pytest.mark.spilling @@ -3265,3 +3293,47 @@ def test_binop_integer_power_int_scalar(): expected = base**exponent.value got = base**exponent utils.assert_eq(expected, got) + + +@pytest.mark.parametrize("op", _binops) +def test_binop_index_series(op): + gi = cudf.Index([10, 11, 12]) + gs = cudf.Series([1, 2, 3]) + + actual = op(gi, gs) + expected = op(gi.to_pandas(), gs.to_pandas()) + + utils.assert_eq(expected, actual) + + +@pytest.mark.parametrize("name1", _series_or_index_names) +@pytest.mark.parametrize("name2", _series_or_index_names) +def test_binop_index_dt_td_series_with_names(name1, name2): + gi = cudf.Index([1, 2, 3], dtype="datetime64[ns]", name=name1) + gs = cudf.Series([10, 11, 12], dtype="timedelta64[ns]", name=name2) + with warnings.catch_warnings(): + # Numpy raises a deprecation warning: + # "elementwise comparison failed; this will raise an error " + warnings.simplefilter("ignore", (DeprecationWarning,)) + + expected = gi.to_pandas() + gs.to_pandas() + actual = gi + gs + + utils.assert_eq(expected, actual) + + +@pytest.mark.parametrize("data1", [[1, 2, 3], [10, 11, None]]) +@pytest.mark.parametrize("data2", [[1, 2, 3], [10, 11, None]]) +def test_binop_eq_ne_index_series(data1, data2): + gi = cudf.Index(data1, dtype="datetime64[ns]", name=np.nan) + gs = cudf.Series(data2, dtype="timedelta64[ns]", name="abc") + + actual = gi == gs + expected = gi.to_pandas() == gs.to_pandas() + + utils.assert_eq(expected, actual) + + actual = gi != gs + expected = gi.to_pandas() != gs.to_pandas() + + utils.assert_eq(expected, actual) diff --git a/python/cudf/cudf/utils/utils.py b/python/cudf/cudf/utils/utils.py index 9298d283b64..dadd9aa2cc4 100644 --- a/python/cudf/cudf/utils/utils.py +++ b/python/cudf/cudf/utils/utils.py @@ -1,5 +1,6 @@ # Copyright (c) 2020-2023, NVIDIA CORPORATION. +import decimal import functools import hashlib import os @@ -384,3 +385,49 @@ def wrapper(self): return fn(self) return wrapper + + +def _is_same_name(left_name, right_name): + # Internal utility to compare if two names are same. + with warnings.catch_warnings(): + # numpy throws warnings while comparing + # NaT values with non-NaT values. 
+ warnings.simplefilter("ignore") + try: + same = (left_name is right_name) or (left_name == right_name) + if not same: + if isinstance(left_name, decimal.Decimal) and isinstance( + right_name, decimal.Decimal + ): + return left_name.is_nan() and right_name.is_nan() + if isinstance(left_name, float) and isinstance( + right_name, float + ): + return np.isnan(left_name) and np.isnan(right_name) + if isinstance(left_name, np.datetime64) and isinstance( + right_name, np.datetime64 + ): + return np.isnan(left_name) and np.isnan(right_name) + return same + except TypeError: + return False + + +def _all_bools_with_nulls(lhs, rhs, bool_fill_value): + # Internal utility to construct a boolean column + # by combining nulls from `lhs` & `rhs`. + if lhs.has_nulls() and rhs.has_nulls(): + result_mask = lhs._get_mask_as_column() & rhs._get_mask_as_column() + elif lhs.has_nulls(): + result_mask = lhs._get_mask_as_column() + elif rhs.has_nulls(): + result_mask = rhs._get_mask_as_column() + else: + result_mask = None + + result_col = column.full( + size=len(lhs), fill_value=bool_fill_value, dtype=cudf.dtype(np.bool_) + ) + if result_mask is not None: + result_col = result_col.set_mask(result_mask.as_mask()) + return result_col From 6fea2df2cb898b22c2a31b8866d02d15c76a234d Mon Sep 17 00:00:00 2001 From: Ashwin Srinath <3190405+shwina@users.noreply.github.com> Date: Fri, 11 Aug 2023 13:55:23 -0400 Subject: [PATCH 044/230] Simplify implementation of interval_range() and fix behaviour for floating `freq` (#13844) Closes #13843 Closes https://github.com/rapidsai/cudf/issues/13847 This PR simplifies the implementation of `interval_range()` and fixes a few different bugs in the process. It also moves all tests for interval indexes to `tests/indexes/test_interval.py`. Finally, while working on this PR, I ran into #13847; a fix for that is also included in this PR. 
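For reference, a minimal sketch of one of the floating-`freq` cases this covers (adapted from the new `test_interval_range_floating` parametrizations added below; the values are illustrative only):

```python
import pandas as pd

import cudf

# A floating-point `freq` previously produced incorrect breaks for some
# parameter combinations; the simplified implementation now derives the
# missing parameter and the bin edges uniformly for all cases.
expected = pd.interval_range(start=0.0, freq=0.2, periods=5)
got = cudf.interval_range(start=0.0, freq=0.2, periods=5)
```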
Authors: - Ashwin Srinath (https://github.com/shwina) Approvers: - GALI PREM SAGAR (https://github.com/galipremsagar) URL: https://github.com/rapidsai/cudf/pull/13844 --- python/cudf/cudf/core/_compat.py | 1 + python/cudf/cudf/core/index.py | 120 +++---- python/cudf/cudf/core/scalar.py | 17 +- .../cudf/cudf/tests/indexes/test_interval.py | 292 ++++++++++++++++++ python/cudf/cudf/tests/test_binops.py | 4 + python/cudf/cudf/tests/test_index.py | 250 --------------- 6 files changed, 350 insertions(+), 334 deletions(-) diff --git a/python/cudf/cudf/core/_compat.py b/python/cudf/cudf/core/_compat.py index 6ecbe414ebb..888b94e070c 100644 --- a/python/cudf/cudf/core/_compat.py +++ b/python/cudf/cudf/core/_compat.py @@ -10,3 +10,4 @@ PANDAS_GE_150 = PANDAS_VERSION >= version.parse("1.5.0") PANDAS_LT_153 = PANDAS_VERSION < version.parse("1.5.3") PANDAS_GE_200 = PANDAS_VERSION >= version.parse("2.0.0") +PANDAS_GE_210 = PANDAS_VERSION >= version.parse("2.1.0") diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py index fa9e63fa4cc..c3d468e5656 100644 --- a/python/cudf/cudf/core/index.py +++ b/python/cudf/cudf/core/index.py @@ -47,7 +47,6 @@ StringColumn, StructColumn, TimeDeltaColumn, - arange, column, ) from cudf.core.column.column import as_column, concat_columns @@ -3007,89 +3006,56 @@ def interval_range( closed='left', dtype='interval') """ - if freq and periods and start and end: + nargs = sum(_ is not None for _ in (start, end, periods, freq)) + + # we need at least three of (start, end, periods, freq) + if nargs == 2 and freq is None: + freq = 1 + nargs += 1 + + if nargs != 3: raise ValueError( "Of the four parameters: start, end, periods, and " "freq, exactly three must be specified" ) - args = [ - cudf.Scalar(x) if x is not None else None - for x in (start, end, freq, periods) - ] + + start = cudf.Scalar(start) if start is not None else start + end = cudf.Scalar(end) if end is not None else end + periods = cudf.Scalar(int(periods)) if periods is not None else periods + freq = cudf.Scalar(freq) if freq is not None else freq + + if start is None: + start = end - freq * periods + elif freq is None: + quotient, remainder = divmod((end - start).value, periods.value) + if remainder: + freq = (end - start) / periods + else: + freq = cudf.Scalar(int(quotient)) + elif periods is None: + periods = cudf.Scalar(int((end - start) / freq)) + elif end is None: + end = start + periods * freq + if any( - not _is_non_decimal_numeric_dtype(x.dtype) if x is not None else False - for x in args + not _is_non_decimal_numeric_dtype(x.dtype) + for x in (start, periods, freq, end) ): raise ValueError("start, end, periods, freq must be numeric values.") - *rargs, periods = args - common_dtype = find_common_type([x.dtype for x in rargs if x]) - start, end, freq = rargs - periods = periods.astype("int64") if periods is not None else None - - if periods and not freq: - # if statement for mypy to pass - if end is not None and start is not None: - # divmod only supported on host side scalars - quotient, remainder = divmod((end - start).value, periods.value) - if remainder: - freq_step = cudf.Scalar((end - start) / periods) - else: - freq_step = cudf.Scalar(quotient) - if start.dtype != freq_step.dtype: - start = start.astype(freq_step.dtype) - bin_edges = sequence( - size=periods + 1, - init=start.device_value, - step=freq_step.device_value, - ) - left_col = bin_edges.slice(0, len(bin_edges) - 1) - right_col = bin_edges.slice(1, len(bin_edges)) - elif freq and periods: - if end: - start = end - (freq 
* periods) - if start: - end = freq * periods + start - if end is not None and start is not None: - left_col = arange( - start.value, end.value, freq.value, dtype=common_dtype - ) - end = end + 1 - start = start + freq - right_col = arange( - start.value, end.value, freq.value, dtype=common_dtype - ) - elif freq and not periods: - if end is not None and start is not None: - end = end - freq + 1 - left_col = arange( - start.value, end.value, freq.value, dtype=common_dtype - ) - end = end + freq + 1 - start = start + freq - right_col = arange( - start.value, end.value, freq.value, dtype=common_dtype - ) - elif start is not None and end is not None: - # if statements for mypy to pass - if freq: - left_col = arange( - start.value, end.value, freq.value, dtype=common_dtype - ) - else: - left_col = arange(start.value, end.value, dtype=common_dtype) - start = start + 1 - end = end + 1 - if freq: - right_col = arange( - start.value, end.value, freq.value, dtype=common_dtype - ) - else: - right_col = arange(start.value, end.value, dtype=common_dtype) - else: - raise ValueError( - "Of the four parameters: start, end, periods, and " - "freq, at least two must be specified" - ) + + periods = periods.astype("int64") + common_dtype = find_common_type((start.dtype, freq.dtype, end.dtype)) + start = start.astype(common_dtype) + freq = freq.astype(common_dtype) + + bin_edges = sequence( + size=periods + 1, + init=start.device_value, + step=freq.device_value, + ) + left_col = bin_edges.slice(0, len(bin_edges) - 1) + right_col = bin_edges.slice(1, len(bin_edges)) + if len(right_col) == 0 or len(left_col) == 0: dtype = IntervalDtype("int64", closed) data = column.column_empty_like_same_mask(left_col, dtype) diff --git a/python/cudf/cudf/core/scalar.py b/python/cudf/cudf/core/scalar.py index e516177ad29..438e3c7477d 100644 --- a/python/cudf/cudf/core/scalar.py +++ b/python/cudf/cudf/core/scalar.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2022, NVIDIA CORPORATION. +# Copyright (c) 2020-2023, NVIDIA CORPORATION. import decimal import operator @@ -353,14 +353,17 @@ def _binaryop(self, other, op: str): def _dispatch_scalar_binop(self, other, op): if isinstance(other, Scalar): - other = other.value + rhs = other.value + else: + rhs = other + lhs = self.value + reflect, op = self._check_reflected_op(op) + if reflect: + lhs, rhs = rhs, lhs try: - func = getattr(operator, op) + return getattr(operator, op)(lhs, rhs) except AttributeError: - func = getattr(self.value, op) - else: - return func(self.value, other) - return func(other) + return getattr(lhs, op)(rhs) def _unaop_result_type_or_error(self, op): if op == "__neg__" and self.dtype == "bool": diff --git a/python/cudf/cudf/tests/indexes/test_interval.py b/python/cudf/cudf/tests/indexes/test_interval.py index f80f6d8bb72..4f435f74d59 100644 --- a/python/cudf/cudf/tests/indexes/test_interval.py +++ b/python/cudf/cudf/tests/indexes/test_interval.py @@ -1,8 +1,12 @@ # Copyright (c) 2023, NVIDIA CORPORATION. 
+import numpy as np import pandas as pd import pyarrow as pa +import pytest import cudf +from cudf.core._compat import PANDAS_GE_210 +from cudf.core.index import IntervalIndex, interval_range from cudf.testing._utils import assert_eq @@ -16,3 +20,291 @@ def test_interval_to_arrow(): expect = pa.Array.from_pandas(pd.IntervalIndex([pd.Interval(0, 1)])) got = cudf.IntervalIndex([pd.Interval(0, 1)]).to_arrow() assert_eq(expect, got) + + +INTERVAL_BOUNDARY_TYPES = [ + int, + np.int8, + np.int16, + np.int32, + np.int64, + np.float32, + np.float64, + cudf.Scalar, +] + + +@pytest.mark.parametrize("closed", ["left", "right", "both", "neither"]) +@pytest.mark.parametrize("start", [0, 1, 2, 3]) +@pytest.mark.parametrize("end", [4, 5, 6, 7]) +def test_interval_range_basic(start, end, closed): + pindex = pd.interval_range(start=start, end=end, closed=closed) + gindex = cudf.interval_range(start=start, end=end, closed=closed) + + assert_eq(pindex, gindex) + + +@pytest.mark.parametrize("start_t", INTERVAL_BOUNDARY_TYPES) +@pytest.mark.parametrize("end_t", INTERVAL_BOUNDARY_TYPES) +def test_interval_range_dtype_basic(start_t, end_t): + start, end = start_t(24), end_t(42) + start_val = start.value if isinstance(start, cudf.Scalar) else start + end_val = end.value if isinstance(end, cudf.Scalar) else end + pindex = pd.interval_range(start=start_val, end=end_val, closed="left") + gindex = cudf.interval_range(start=start, end=end, closed="left") + + assert_eq(pindex, gindex) + + +@pytest.mark.parametrize("closed", ["left", "right", "both", "neither"]) +@pytest.mark.parametrize("start", [0]) +@pytest.mark.parametrize("end", [0]) +def test_interval_range_empty(start, end, closed): + pindex = pd.interval_range(start=start, end=end, closed=closed) + gindex = cudf.interval_range(start=start, end=end, closed=closed) + + assert_eq(pindex, gindex) + + +@pytest.mark.parametrize("closed", ["left", "right", "both", "neither"]) +@pytest.mark.parametrize("freq", [1, 2, 3]) +@pytest.mark.parametrize("start", [0, 1, 2, 3, 5]) +@pytest.mark.parametrize("end", [6, 8, 10, 43, 70]) +def test_interval_range_freq_basic(start, end, freq, closed): + pindex = pd.interval_range(start=start, end=end, freq=freq, closed=closed) + gindex = cudf.interval_range( + start=start, end=end, freq=freq, closed=closed + ) + + assert_eq(pindex, gindex) + + +@pytest.mark.parametrize("start_t", INTERVAL_BOUNDARY_TYPES) +@pytest.mark.parametrize("end_t", INTERVAL_BOUNDARY_TYPES) +@pytest.mark.parametrize("freq_t", INTERVAL_BOUNDARY_TYPES) +def test_interval_range_freq_basic_dtype(start_t, end_t, freq_t): + start, end, freq = start_t(5), end_t(70), freq_t(3) + start_val = start.value if isinstance(start, cudf.Scalar) else start + end_val = end.value if isinstance(end, cudf.Scalar) else end + freq_val = freq.value if isinstance(freq, cudf.Scalar) else freq + pindex = pd.interval_range( + start=start_val, end=end_val, freq=freq_val, closed="left" + ) + gindex = cudf.interval_range( + start=start, end=end, freq=freq, closed="left" + ) + + assert_eq(pindex, gindex) + + +@pytest.mark.parametrize("closed", ["left", "right", "both", "neither"]) +@pytest.mark.parametrize("periods", [1, 1.0, 2, 2.0, 3.0, 3]) +@pytest.mark.parametrize("start", [0, 0.0, 1.0, 1, 2, 2.0, 3.0, 3]) +@pytest.mark.parametrize("end", [4, 4.0, 5.0, 5, 6, 6.0, 7.0, 7]) +def test_interval_range_periods_basic(start, end, periods, closed): + pindex = pd.interval_range( + start=start, end=end, periods=periods, closed=closed + ) + gindex = cudf.interval_range( + start=start, end=end, 
periods=periods, closed=closed + ) + + assert_eq(pindex, gindex) + + +@pytest.mark.parametrize("start_t", INTERVAL_BOUNDARY_TYPES) +@pytest.mark.parametrize("end_t", INTERVAL_BOUNDARY_TYPES) +@pytest.mark.parametrize("periods_t", INTERVAL_BOUNDARY_TYPES) +def test_interval_range_periods_basic_dtype(start_t, end_t, periods_t): + start, end, periods = start_t(0), end_t(4), periods_t(1.0) + start_val = start.value if isinstance(start, cudf.Scalar) else start + end_val = end.value if isinstance(end, cudf.Scalar) else end + periods_val = ( + periods.value if isinstance(periods, cudf.Scalar) else periods + ) + pindex = pd.interval_range( + start=start_val, end=end_val, periods=periods_val, closed="left" + ) + gindex = cudf.interval_range( + start=start, end=end, periods=periods, closed="left" + ) + + assert_eq(pindex, gindex) + + +@pytest.mark.parametrize("closed", ["left", "right", "both", "neither"]) +@pytest.mark.parametrize("periods", [1, 2, 3]) +@pytest.mark.parametrize("freq", [1, 2, 3, 4]) +@pytest.mark.parametrize("end", [4, 8, 9, 10]) +def test_interval_range_periods_freq_end(end, freq, periods, closed): + pindex = pd.interval_range( + end=end, freq=freq, periods=periods, closed=closed + ) + gindex = cudf.interval_range( + end=end, freq=freq, periods=periods, closed=closed + ) + + assert_eq(pindex, gindex) + + +@pytest.mark.parametrize("periods_t", INTERVAL_BOUNDARY_TYPES) +@pytest.mark.parametrize("freq_t", INTERVAL_BOUNDARY_TYPES) +@pytest.mark.parametrize("end_t", INTERVAL_BOUNDARY_TYPES) +def test_interval_range_periods_freq_end_dtype(periods_t, freq_t, end_t): + periods, freq, end = periods_t(2), freq_t(3), end_t(10) + freq_val = freq.value if isinstance(freq, cudf.Scalar) else freq + end_val = end.value if isinstance(end, cudf.Scalar) else end + periods_val = ( + periods.value if isinstance(periods, cudf.Scalar) else periods + ) + pindex = pd.interval_range( + end=end_val, freq=freq_val, periods=periods_val, closed="left" + ) + gindex = cudf.interval_range( + end=end, freq=freq, periods=periods, closed="left" + ) + + assert_eq(pindex, gindex) + + +@pytest.mark.parametrize("closed", ["left", "right", "both", "neither"]) +@pytest.mark.parametrize("periods", [1, 2, 3]) +@pytest.mark.parametrize("freq", [1, 2, 3, 4]) +@pytest.mark.parametrize("start", [1, 4, 9, 12]) +def test_interval_range_periods_freq_start(start, freq, periods, closed): + pindex = pd.interval_range( + start=start, freq=freq, periods=periods, closed=closed + ) + gindex = cudf.interval_range( + start=start, freq=freq, periods=periods, closed=closed + ) + + assert_eq(pindex, gindex) + + +@pytest.mark.parametrize("periods_t", INTERVAL_BOUNDARY_TYPES) +@pytest.mark.parametrize("freq_t", INTERVAL_BOUNDARY_TYPES) +@pytest.mark.parametrize("start_t", INTERVAL_BOUNDARY_TYPES) +def test_interval_range_periods_freq_start_dtype(periods_t, freq_t, start_t): + periods, freq, start = periods_t(2), freq_t(3), start_t(9) + freq_val = freq.value if isinstance(freq, cudf.Scalar) else freq + start_val = start.value if isinstance(start, cudf.Scalar) else start + periods_val = ( + periods.value if isinstance(periods, cudf.Scalar) else periods + ) + pindex = pd.interval_range( + start=start_val, freq=freq_val, periods=periods_val, closed="left" + ) + gindex = cudf.interval_range( + start=start, freq=freq, periods=periods, closed="left" + ) + + assert_eq(pindex, gindex) + + +@pytest.mark.parametrize("closed", ["right", "left", "both", "neither"]) +@pytest.mark.parametrize( + "data", + [ + ([pd.Interval(30, 50)]), + ([pd.Interval(0, 3), 
pd.Interval(1, 7)]), + ([pd.Interval(0.2, 60.3), pd.Interval(1, 7), pd.Interval(0, 0)]), + ([]), + ], +) +def test_interval_index_basic(data, closed): + pindex = pd.IntervalIndex(data, closed=closed) + gindex = IntervalIndex(data, closed=closed) + + assert_eq(pindex, gindex) + + +@pytest.mark.parametrize("closed", ["right", "left", "both", "neither"]) +def test_interval_index_empty(closed): + pindex = pd.IntervalIndex([], closed=closed) + gindex = IntervalIndex([], closed=closed) + + assert_eq(pindex, gindex) + + +@pytest.mark.parametrize("closed", ["right", "left", "both", "neither"]) +@pytest.mark.parametrize( + "data", + [ + ([pd.Interval(1, 6), pd.Interval(1, 10), pd.Interval(1, 3)]), + ( + [ + pd.Interval(3.5, 6.0), + pd.Interval(1.0, 7.0), + pd.Interval(0.0, 10.0), + ] + ), + ( + [ + pd.Interval(50, 100, closed="left"), + pd.Interval(1.0, 7.0, closed="left"), + pd.Interval(16, 322, closed="left"), + ] + ), + ( + [ + pd.Interval(50, 100, closed="right"), + pd.Interval(1.0, 7.0, closed="right"), + pd.Interval(16, 322, closed="right"), + ] + ), + ], +) +def test_interval_index_many_params(data, closed): + pindex = pd.IntervalIndex(data, closed=closed) + gindex = IntervalIndex(data, closed=closed) + + assert_eq(pindex, gindex) + + +@pytest.mark.parametrize("closed", ["left", "right", "both", "neither"]) +def test_interval_index_from_breaks(closed): + breaks = [0, 3, 6, 10] + pindex = pd.IntervalIndex.from_breaks(breaks, closed=closed) + gindex = IntervalIndex.from_breaks(breaks, closed=closed) + + assert_eq(pindex, gindex) + + +@pytest.mark.parametrize( + "start, stop, freq, periods", + [ + (0.0, None, 0.2, 5), + (0.0, 1.0, None, 5), + pytest.param( + 0.0, + 1.0, + 0.2, + None, + marks=pytest.mark.xfail( + condition=not PANDAS_GE_210, + reason="https://github.com/pandas-dev/pandas/pull/54477", + ), + ), + (None, 1.0, 0.2, 5), + pytest.param( + 0.0, + 1.0, + 0.1, + None, + marks=pytest.mark.xfail( + condition=not PANDAS_GE_210, + reason="https://github.com/pandas-dev/pandas/pull/54477", + ), + ), + (0.0, 1.0, None, 10), + (0.0, None, 0.25, 4), + (1.0, None, 2.5, 2), + ], +) +def test_interval_range_floating(start, stop, freq, periods): + expected = pd.interval_range( + start=start, end=stop, freq=freq, periods=periods + ) + got = interval_range(start=start, end=stop, freq=freq, periods=periods) + assert_eq(expected, got) diff --git a/python/cudf/cudf/tests/test_binops.py b/python/cudf/cudf/tests/test_binops.py index 4965fb728c2..65a15d7c567 100644 --- a/python/cudf/cudf/tests/test_binops.py +++ b/python/cudf/cudf/tests/test_binops.py @@ -3295,6 +3295,10 @@ def test_binop_integer_power_int_scalar(): utils.assert_eq(expected, got) +def test_numpy_int_scalar_binop(): + assert (np.float32(1.0) - cudf.Scalar(1)) == 0.0 + + @pytest.mark.parametrize("op", _binops) def test_binop_index_series(op): gi = cudf.Index([10, 11, 12]) diff --git a/python/cudf/cudf/tests/test_index.py b/python/cudf/cudf/tests/test_index.py index 62b58fc3d1a..4efd7db4bc5 100644 --- a/python/cudf/cudf/tests/test_index.py +++ b/python/cudf/cudf/tests/test_index.py @@ -16,7 +16,6 @@ CategoricalIndex, DatetimeIndex, GenericIndex, - IntervalIndex, RangeIndex, as_index, ) @@ -1352,255 +1351,6 @@ def test_categorical_index_basic(data, categories, dtype, ordered, name): assert_eq(pindex, gindex) -INTERVAL_BOUNDARY_TYPES = [ - int, - np.int8, - np.int16, - np.int32, - np.int64, - np.float32, - np.float64, - cudf.Scalar, -] - - -@pytest.mark.parametrize("closed", ["left", "right", "both", "neither"]) 
-@pytest.mark.parametrize("start", [0, 1, 2, 3]) -@pytest.mark.parametrize("end", [4, 5, 6, 7]) -def test_interval_range_basic(start, end, closed): - pindex = pd.interval_range(start=start, end=end, closed=closed) - gindex = cudf.interval_range(start=start, end=end, closed=closed) - - assert_eq(pindex, gindex) - - -@pytest.mark.parametrize("start_t", INTERVAL_BOUNDARY_TYPES) -@pytest.mark.parametrize("end_t", INTERVAL_BOUNDARY_TYPES) -def test_interval_range_dtype_basic(start_t, end_t): - start, end = start_t(24), end_t(42) - start_val = start.value if isinstance(start, cudf.Scalar) else start - end_val = end.value if isinstance(end, cudf.Scalar) else end - pindex = pd.interval_range(start=start_val, end=end_val, closed="left") - gindex = cudf.interval_range(start=start, end=end, closed="left") - - assert_eq(pindex, gindex) - - -@pytest.mark.parametrize("closed", ["left", "right", "both", "neither"]) -@pytest.mark.parametrize("start", [0]) -@pytest.mark.parametrize("end", [0]) -def test_interval_range_empty(start, end, closed): - pindex = pd.interval_range(start=start, end=end, closed=closed) - gindex = cudf.interval_range(start=start, end=end, closed=closed) - - assert_eq(pindex, gindex) - - -@pytest.mark.parametrize("closed", ["left", "right", "both", "neither"]) -@pytest.mark.parametrize("freq", [1, 2, 3]) -@pytest.mark.parametrize("start", [0, 1, 2, 3, 5]) -@pytest.mark.parametrize("end", [6, 8, 10, 43, 70]) -def test_interval_range_freq_basic(start, end, freq, closed): - pindex = pd.interval_range(start=start, end=end, freq=freq, closed=closed) - gindex = cudf.interval_range( - start=start, end=end, freq=freq, closed=closed - ) - - assert_eq(pindex, gindex) - - -@pytest.mark.parametrize("start_t", INTERVAL_BOUNDARY_TYPES) -@pytest.mark.parametrize("end_t", INTERVAL_BOUNDARY_TYPES) -@pytest.mark.parametrize("freq_t", INTERVAL_BOUNDARY_TYPES) -def test_interval_range_freq_basic_dtype(start_t, end_t, freq_t): - start, end, freq = start_t(5), end_t(70), freq_t(3) - start_val = start.value if isinstance(start, cudf.Scalar) else start - end_val = end.value if isinstance(end, cudf.Scalar) else end - freq_val = freq.value if isinstance(freq, cudf.Scalar) else freq - pindex = pd.interval_range( - start=start_val, end=end_val, freq=freq_val, closed="left" - ) - gindex = cudf.interval_range( - start=start, end=end, freq=freq, closed="left" - ) - - assert_eq(pindex, gindex) - - -@pytest.mark.parametrize("closed", ["left", "right", "both", "neither"]) -@pytest.mark.parametrize("periods", [1, 1.0, 2, 2.0, 3.0, 3]) -@pytest.mark.parametrize("start", [0, 0.0, 1.0, 1, 2, 2.0, 3.0, 3]) -@pytest.mark.parametrize("end", [4, 4.0, 5.0, 5, 6, 6.0, 7.0, 7]) -def test_interval_range_periods_basic(start, end, periods, closed): - pindex = pd.interval_range( - start=start, end=end, periods=periods, closed=closed - ) - gindex = cudf.interval_range( - start=start, end=end, periods=periods, closed=closed - ) - - assert_eq(pindex, gindex) - - -@pytest.mark.parametrize("start_t", INTERVAL_BOUNDARY_TYPES) -@pytest.mark.parametrize("end_t", INTERVAL_BOUNDARY_TYPES) -@pytest.mark.parametrize("periods_t", INTERVAL_BOUNDARY_TYPES) -def test_interval_range_periods_basic_dtype(start_t, end_t, periods_t): - start, end, periods = start_t(0), end_t(4), periods_t(1.0) - start_val = start.value if isinstance(start, cudf.Scalar) else start - end_val = end.value if isinstance(end, cudf.Scalar) else end - periods_val = ( - periods.value if isinstance(periods, cudf.Scalar) else periods - ) - pindex = pd.interval_range( - 
start=start_val, end=end_val, periods=periods_val, closed="left" - ) - gindex = cudf.interval_range( - start=start, end=end, periods=periods, closed="left" - ) - - assert_eq(pindex, gindex) - - -@pytest.mark.parametrize("closed", ["left", "right", "both", "neither"]) -@pytest.mark.parametrize("periods", [1, 2, 3]) -@pytest.mark.parametrize("freq", [1, 2, 3, 4]) -@pytest.mark.parametrize("end", [4, 8, 9, 10]) -def test_interval_range_periods_freq_end(end, freq, periods, closed): - pindex = pd.interval_range( - end=end, freq=freq, periods=periods, closed=closed - ) - gindex = cudf.interval_range( - end=end, freq=freq, periods=periods, closed=closed - ) - - assert_eq(pindex, gindex) - - -@pytest.mark.parametrize("periods_t", INTERVAL_BOUNDARY_TYPES) -@pytest.mark.parametrize("freq_t", INTERVAL_BOUNDARY_TYPES) -@pytest.mark.parametrize("end_t", INTERVAL_BOUNDARY_TYPES) -def test_interval_range_periods_freq_end_dtype(periods_t, freq_t, end_t): - periods, freq, end = periods_t(2), freq_t(3), end_t(10) - freq_val = freq.value if isinstance(freq, cudf.Scalar) else freq - end_val = end.value if isinstance(end, cudf.Scalar) else end - periods_val = ( - periods.value if isinstance(periods, cudf.Scalar) else periods - ) - pindex = pd.interval_range( - end=end_val, freq=freq_val, periods=periods_val, closed="left" - ) - gindex = cudf.interval_range( - end=end, freq=freq, periods=periods, closed="left" - ) - - assert_eq(pindex, gindex) - - -@pytest.mark.parametrize("closed", ["left", "right", "both", "neither"]) -@pytest.mark.parametrize("periods", [1, 2, 3]) -@pytest.mark.parametrize("freq", [1, 2, 3, 4]) -@pytest.mark.parametrize("start", [1, 4, 9, 12]) -def test_interval_range_periods_freq_start(start, freq, periods, closed): - pindex = pd.interval_range( - start=start, freq=freq, periods=periods, closed=closed - ) - gindex = cudf.interval_range( - start=start, freq=freq, periods=periods, closed=closed - ) - - assert_eq(pindex, gindex) - - -@pytest.mark.parametrize("periods_t", INTERVAL_BOUNDARY_TYPES) -@pytest.mark.parametrize("freq_t", INTERVAL_BOUNDARY_TYPES) -@pytest.mark.parametrize("start_t", INTERVAL_BOUNDARY_TYPES) -def test_interval_range_periods_freq_start_dtype(periods_t, freq_t, start_t): - periods, freq, start = periods_t(2), freq_t(3), start_t(9) - freq_val = freq.value if isinstance(freq, cudf.Scalar) else freq - start_val = start.value if isinstance(start, cudf.Scalar) else start - periods_val = ( - periods.value if isinstance(periods, cudf.Scalar) else periods - ) - pindex = pd.interval_range( - start=start_val, freq=freq_val, periods=periods_val, closed="left" - ) - gindex = cudf.interval_range( - start=start, freq=freq, periods=periods, closed="left" - ) - - assert_eq(pindex, gindex) - - -@pytest.mark.parametrize("closed", ["right", "left", "both", "neither"]) -@pytest.mark.parametrize( - "data", - [ - ([pd.Interval(30, 50)]), - ([pd.Interval(0, 3), pd.Interval(1, 7)]), - ([pd.Interval(0.2, 60.3), pd.Interval(1, 7), pd.Interval(0, 0)]), - ([]), - ], -) -def test_interval_index_basic(data, closed): - pindex = pd.IntervalIndex(data, closed=closed) - gindex = IntervalIndex(data, closed=closed) - - assert_eq(pindex, gindex) - - -@pytest.mark.parametrize("closed", ["right", "left", "both", "neither"]) -def test_interval_index_empty(closed): - pindex = pd.IntervalIndex([], closed=closed) - gindex = IntervalIndex([], closed=closed) - - assert_eq(pindex, gindex) - - -@pytest.mark.parametrize("closed", ["right", "left", "both", "neither"]) -@pytest.mark.parametrize( - "data", - [ - 
([pd.Interval(1, 6), pd.Interval(1, 10), pd.Interval(1, 3)]), - ( - [ - pd.Interval(3.5, 6.0), - pd.Interval(1.0, 7.0), - pd.Interval(0.0, 10.0), - ] - ), - ( - [ - pd.Interval(50, 100, closed="left"), - pd.Interval(1.0, 7.0, closed="left"), - pd.Interval(16, 322, closed="left"), - ] - ), - ( - [ - pd.Interval(50, 100, closed="right"), - pd.Interval(1.0, 7.0, closed="right"), - pd.Interval(16, 322, closed="right"), - ] - ), - ], -) -def test_interval_index_many_params(data, closed): - pindex = pd.IntervalIndex(data, closed=closed) - gindex = IntervalIndex(data, closed=closed) - - assert_eq(pindex, gindex) - - -@pytest.mark.parametrize("closed", ["left", "right", "both", "neither"]) -def test_interval_index_from_breaks(closed): - breaks = [0, 3, 6, 10] - pindex = pd.IntervalIndex.from_breaks(breaks, closed=closed) - gindex = IntervalIndex.from_breaks(breaks, closed=closed) - - assert_eq(pindex, gindex) - - @pytest.mark.parametrize( "data", [ From 1050325fdbe9c474a22399c184ea6aa5119934c7 Mon Sep 17 00:00:00 2001 From: GALI PREM SAGAR Date: Fri, 11 Aug 2023 13:52:29 -0500 Subject: [PATCH 045/230] Fix an issue with fetching `NA` from a `TimedeltaColumn` (#13853) Fixes: #13851 This PR fixes a mistake in `_get_np_scalar_from_timedelta64`, which has to return `cudf.NA` instead of `None`. Authors: - GALI PREM SAGAR (https://github.com/galipremsagar) Approvers: - Lawrence Mitchell (https://github.com/wence-) - Bradley Dice (https://github.com/bdice) URL: https://github.com/rapidsai/cudf/pull/13853 --- python/cudf/cudf/_lib/scalar.pyx | 4 ++-- python/cudf/cudf/tests/test_timedelta.py | 5 +++++ 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/python/cudf/cudf/_lib/scalar.pyx b/python/cudf/cudf/_lib/scalar.pyx index af63964bac3..0ff736b9204 100644 --- a/python/cudf/cudf/_lib/scalar.pyx +++ b/python/cudf/cudf/_lib/scalar.pyx @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2022, NVIDIA CORPORATION. +# Copyright (c) 2020-2023, NVIDIA CORPORATION. cimport cython @@ -536,7 +536,7 @@ cdef _get_np_scalar_from_timedelta64(unique_ptr[scalar]& s): cdef scalar* s_ptr = s.get() if not s_ptr[0].is_valid(): - return None + return NA cdef libcudf_types.data_type cdtype = s_ptr[0].type() diff --git a/python/cudf/cudf/tests/test_timedelta.py b/python/cudf/cudf/tests/test_timedelta.py index ab45374c119..82ef309d116 100644 --- a/python/cudf/cudf/tests/test_timedelta.py +++ b/python/cudf/cudf/tests/test_timedelta.py @@ -1437,3 +1437,8 @@ def test_timdelta_binop_tz_timestamp(op): date_tz_scalar = datetime.datetime.now(datetime.timezone.utc) with pytest.raises(NotImplementedError): op(s, date_tz_scalar) + + +def test_timedelta_getitem_na(): + s = cudf.Series([1, 2, None, 3], dtype="timedelta64[ns]") + assert s[2] is cudf.NA From 6a407cf0e235abdd465a29cf1321a750aae6ff46 Mon Sep 17 00:00:00 2001 From: Elias Stehle <3958403+elstehle@users.noreply.github.com> Date: Fri, 11 Aug 2023 22:13:33 +0200 Subject: [PATCH 046/230] Fixes a performance regression in FST (#13850) https://github.com/rapidsai/cudf/pull/13344 introduced a performance regression to the FST benchmarks that showed as much as a 35% performance degradation. It seems that, after the refactor in the above PR, compiler optimization heuristics are deciding differently on loop unrolling in the part of the FST that's writing out transduced symbols. As a fix, we are enforcing to not unroll that loop. 
Authors: - Elias Stehle (https://github.com/elstehle) Approvers: - Karthikeyan (https://github.com/karthikeyann) - David Wendt (https://github.com/davidwendt) URL: https://github.com/rapidsai/cudf/pull/13850 --- cpp/src/io/fst/agent_dfa.cuh | 3 +++ 1 file changed, 3 insertions(+) diff --git a/cpp/src/io/fst/agent_dfa.cuh b/cpp/src/io/fst/agent_dfa.cuh index 52fd039c097..f867a95a864 100644 --- a/cpp/src/io/fst/agent_dfa.cuh +++ b/cpp/src/io/fst/agent_dfa.cuh @@ -92,6 +92,9 @@ class DFASimulationCallbackWrapper { { uint32_t const count = transducer_table(old_state, symbol_id, read_symbol); if (write) { +#if __CUDA_ARCH__ > 0 +#pragma unroll 1 +#endif for (uint32_t out_char = 0; out_char < count; out_char++) { out_it[out_count + out_char] = transducer_table(old_state, symbol_id, out_char, read_symbol); From bf9b1101c2cfd84f3f4d463331927090dc39ec9c Mon Sep 17 00:00:00 2001 From: Xinyu Zeng Date: Sat, 12 Aug 2023 04:38:55 +0800 Subject: [PATCH 047/230] Fix typo in parquet/page_decode.cuh (#13849) This PR fixes a typo in the parquet page decoding doc and adds a missing doc. Authors: - Xinyu Zeng (https://github.com/XinyuZeng) Approvers: - Nghia Truong (https://github.com/ttnghia) - David Wendt (https://github.com/davidwendt) URL: https://github.com/rapidsai/cudf/pull/13849 --- cpp/src/io/parquet/page_decode.cuh | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/cpp/src/io/parquet/page_decode.cuh b/cpp/src/io/parquet/page_decode.cuh index af53a09e4db..4c4607150ce 100644 --- a/cpp/src/io/parquet/page_decode.cuh +++ b/cpp/src/io/parquet/page_decode.cuh @@ -443,8 +443,9 @@ __device__ size_type gpuInitStringDescriptors(page_state_s volatile* s, /** * @brief Decode values out of a definition or repetition stream * + * @param[out] output Level buffer output * @param[in,out] s Page state input/output - * @param[in] t target_count Target count of stream values on output + * @param[in] target_count Target count of stream values on output * @param[in] t Warp0 thread ID (0..31) * @param[in] lvl The level type we are decoding - DEFINITION or REPETITION */ From 989c4116a3b054f46a7f1152f234b2601d2583b0 Mon Sep 17 00:00:00 2001 From: GALI PREM SAGAR Date: Fri, 11 Aug 2023 21:44:03 -0500 Subject: [PATCH 048/230] Disable construction of Index when `freq` is set in pandas-compatibility mode (#13857) This PR raises an error when a `cudf` column/series/Index is being constructed using a pandas Index that has a `freq` set. This error is raised only in pandas-compatibility mode, because we will have to switch to `cudf.date_range` everywhere in the code base and examples, and `cudf.date_range` still isn't at a full feature parity with `pd.date_range`. 
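For reference, a minimal sketch of the new guard (adapted from the test added below):

```python
import pandas as pd

import cudf

pidx = pd.DatetimeIndex([1, 2, 3], freq="infer")
with cudf.option_context("mode.pandas_compatible", True):
    # Raises NotImplementedError("freq is not implemented yet") because the
    # pandas index carries a `freq` attribute.
    cudf.Index(pidx)
```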
Authors: - GALI PREM SAGAR (https://github.com/galipremsagar) Approvers: - Ashwin Srinath (https://github.com/shwina) URL: https://github.com/rapidsai/cudf/pull/13857 --- python/cudf/cudf/core/column/column.py | 6 ++++++ python/cudf/cudf/tests/test_index.py | 8 ++++++++ 2 files changed, 14 insertions(+) diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index 53dbb9c50cc..0b2b9fda2fd 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -2237,6 +2237,12 @@ def as_column( raise NotImplementedError( "cuDF does not yet support timezone-aware datetimes" ) + elif ( + cudf.get_option("mode.pandas_compatible") + and isinstance(arbitrary, (pd.DatetimeIndex, pd.TimedeltaIndex)) + and arbitrary.freq is not None + ): + raise NotImplementedError("freq is not implemented yet") else: try: data = as_column( diff --git a/python/cudf/cudf/tests/test_index.py b/python/cudf/cudf/tests/test_index.py index 4efd7db4bc5..2da3d3d3ce1 100644 --- a/python/cudf/cudf/tests/test_index.py +++ b/python/cudf/cudf/tests/test_index.py @@ -2670,3 +2670,11 @@ def test_index_mixed_dtype_error(data): pi = pd.Index(data) with pytest.raises(TypeError): cudf.Index(pi) + + +@pytest.mark.parametrize("cls", [pd.DatetimeIndex, pd.TimedeltaIndex]) +def test_index_date_duration_freq_error(cls): + s = cls([1, 2, 3], freq="infer") + with cudf.option_context("mode.pandas_compatible", True): + with pytest.raises(NotImplementedError): + cudf.Index(s) From 65e572dc2ae8b302dd658ce66f85f476f1334775 Mon Sep 17 00:00:00 2001 From: GALI PREM SAGAR Date: Mon, 14 Aug 2023 08:06:44 -0500 Subject: [PATCH 049/230] Fix return type of `MultiIndex.levels` (#13870) Fixes: #13863 This PR fixes the return type of `MultiIndex.levels` to return `Index` objects. Authors: - GALI PREM SAGAR (https://github.com/galipremsagar) Approvers: - Lawrence Mitchell (https://github.com/wence-) URL: https://github.com/rapidsai/cudf/pull/13870 --- python/cudf/cudf/core/multiindex.py | 13 ++++--------- python/cudf/cudf/tests/test_multiindex.py | 10 ++++++++++ 2 files changed, 14 insertions(+), 9 deletions(-) diff --git a/python/cudf/cudf/core/multiindex.py b/python/cudf/cudf/core/multiindex.py index bb0c25a9970..6e9d068ef50 100644 --- a/python/cudf/cudf/core/multiindex.py +++ b/python/cudf/cudf/core/multiindex.py @@ -619,14 +619,8 @@ def levels(self): (3, 12)], names=['a', 'b']) >>> midx.levels - [0 1 - 1 2 - 2 3 - dtype: int64, 0 10 - 1 11 - 2 12 - dtype: int64] - """ + [Int64Index([1, 2, 3], dtype='int64', name='a'), Int64Index([10, 11, 12], dtype='int64', name='b')] + """ # noqa: E501 if self._levels is None: self._compute_levels_and_codes() return self._levels @@ -772,8 +766,9 @@ def _compute_levels_and_codes(self): # `factorize` show up in other parts of public APIs. 
warnings.simplefilter("ignore") code, cats = cudf.Series._from_data({None: col}).factorize() + cats.name = name codes[name] = code.astype(np.int64) - levels.append(cudf.Series(cats, name=None)) + levels.append(cats) self._levels = levels self._codes = cudf.DataFrame._from_data(codes) diff --git a/python/cudf/cudf/tests/test_multiindex.py b/python/cudf/cudf/tests/test_multiindex.py index 30960724bd6..a4099bb7f88 100644 --- a/python/cudf/cudf/tests/test_multiindex.py +++ b/python/cudf/cudf/tests/test_multiindex.py @@ -1868,3 +1868,13 @@ def test_multiindex_index_single_row(): gdf.index = idx pdf = gdf.to_pandas() assert_eq(pdf.loc[("b", 3)], gdf.loc[("b", 3)]) + + +def test_multiindex_levels(): + gidx = cudf.MultiIndex.from_product( + [range(3), ["one", "two"]], names=["first", "second"] + ) + pidx = gidx.to_pandas() + + assert_eq(gidx.levels[0], pidx.levels[0]) + assert_eq(gidx.levels[1], pidx.levels[1]) From 137ea685fa482827bb70101952db25393661157b Mon Sep 17 00:00:00 2001 From: David Wendt <45795991+davidwendt@users.noreply.github.com> Date: Mon, 14 Aug 2023 12:47:02 -0400 Subject: [PATCH 050/230] Update make_lists_column_from_scalar to use make_offsets_child_column utility (#13841) Internal lists functions `make_lists_column_from_scalar` (used by `make_column_from_scalar`) and `generate_list_offsets_and_validities` (used by `concatenate_list_elements`) are updated to use the `make_offsets_child_column` utility to build the offsets from sizes. This utility handles `size_type` overflow when computing an offsets column in a consistent way (i.e. throwing `std::overflow_error` appropriately). Closes #13833 Authors: - David Wendt (https://github.com/davidwendt) Approvers: - Nghia Truong (https://github.com/ttnghia) - Bradley Dice (https://github.com/bdice) - Robert (Bobby) Evans (https://github.com/revans2) URL: https://github.com/rapidsai/cudf/pull/13841 --- cpp/include/cudf/lists/detail/gather.cuh | 27 ++++++------------- .../combine/concatenate_list_elements.cu | 23 ++++++---------- cpp/src/lists/lists_column_factories.cu | 12 +++------ cpp/tests/column/factories_test.cpp | 10 +++++++ 4 files changed, 30 insertions(+), 42 deletions(-) diff --git a/cpp/include/cudf/lists/detail/gather.cuh b/cpp/include/cudf/lists/detail/gather.cuh index 83710a49f6a..18fe707fd69 100644 --- a/cpp/include/cudf/lists/detail/gather.cuh +++ b/cpp/include/cudf/lists/detail/gather.cuh @@ -17,6 +17,7 @@ #include #include +#include #include #include #include @@ -28,7 +29,6 @@ #include #include #include -#include namespace cudf { namespace lists { @@ -74,25 +74,15 @@ gather_data make_gather_data(cudf::lists_column_view const& source_column, { // size of the gather map is the # of output rows size_type output_count = gather_map_size; - size_type offset_count = output_count + 1; // offsets of the source column int32_t const* src_offsets{source_column.offsets().data() + source_column.offset()}; size_type const src_size = source_column.size(); - // outgoing offsets. these will persist as output from the entire gather operation - auto dst_offsets_c = cudf::make_fixed_width_column( - data_type{type_id::INT32}, offset_count, mask_state::UNALLOCATED, stream, mr); - mutable_column_view dst_offsets_v = dst_offsets_c->mutable_view(); auto const source_column_nullmask = source_column.null_mask(); - // generate the compacted outgoing offsets. 
- auto count_iter = thrust::make_counting_iterator(0); - thrust::transform_exclusive_scan( - rmm::exec_policy_nosync(stream), - count_iter, - count_iter + offset_count, - dst_offsets_v.begin(), + auto sizes_itr = cudf::detail::make_counting_transform_iterator( + 0, [source_column_nullmask, source_column_offset = source_column.offset(), gather_map, @@ -112,9 +102,10 @@ gather_data make_gather_data(cudf::lists_column_view const& source_column, // the length of this list return src_offsets[offset_index + 1] - src_offsets[offset_index]; - }, - 0, - thrust::plus()); + }); + + auto [dst_offsets_c, map_size] = + cudf::detail::make_offsets_child_column(sizes_itr, sizes_itr + output_count, stream, mr); // handle sliced columns size_type const shift = @@ -147,9 +138,7 @@ gather_data make_gather_data(cudf::lists_column_view const& source_column, }); // Retrieve size of the resulting gather map for level N+1 (the last offset) - size_type child_gather_map_size = - cudf::detail::get_value(dst_offsets_c->view(), output_count, stream); - + auto const child_gather_map_size = static_cast(map_size); return {std::move(dst_offsets_c), std::move(base_offsets), child_gather_map_size}; } diff --git a/cpp/src/lists/combine/concatenate_list_elements.cu b/cpp/src/lists/combine/concatenate_list_elements.cu index 3b00d7bd26e..fbe297765f8 100644 --- a/cpp/src/lists/combine/concatenate_list_elements.cu +++ b/cpp/src/lists/combine/concatenate_list_elements.cu @@ -19,7 +19,9 @@ #include #include #include +#include #include +#include #include #include #include @@ -120,12 +122,8 @@ generate_list_offsets_and_validities(column_view const& input, { auto const num_rows = input.size(); - auto out_offsets = make_numeric_column( - data_type{type_to_id()}, num_rows + 1, mask_state::UNALLOCATED, stream, mr); - auto const lists_of_lists_dv_ptr = column_device_view::create(input, stream); auto const lists_dv_ptr = column_device_view::create(lists_column_view(input).child(), stream); - auto const d_out_offsets = out_offsets->mutable_view().template begin(); auto const d_row_offsets = lists_column_view(input).offsets_begin(); auto const d_list_offsets = lists_column_view(lists_column_view(input).child()).offsets_begin(); @@ -133,23 +131,19 @@ generate_list_offsets_and_validities(column_view const& input, auto validities = rmm::device_uvector(num_rows, stream); // Compute output list sizes and validities. - auto const iter = thrust::make_counting_iterator(0); - thrust::transform( - rmm::exec_policy(stream), - iter, - iter + num_rows, - d_out_offsets, + auto sizes_itr = cudf::detail::make_counting_transform_iterator( + 0, [lists_of_lists_dv = *lists_of_lists_dv_ptr, lists_dv = *lists_dv_ptr, d_row_offsets, d_list_offsets, - d_validities = validities.begin(), - iter] __device__(auto const idx) { + d_validities = validities.begin()] __device__(auto const idx) { if (d_row_offsets[idx] == d_row_offsets[idx + 1]) { // This is a null/empty row. d_validities[idx] = static_cast(lists_of_lists_dv.is_valid(idx)); return size_type{0}; } // The output row will not be null only if all lists on the input row are not null. + auto const iter = thrust::make_counting_iterator(0); auto const is_valid = thrust::all_of(thrust::seq, iter + d_row_offsets[idx], @@ -161,10 +155,9 @@ generate_list_offsets_and_validities(column_view const& input, // Compute size of the output list as sum of sizes of all lists in the current input row. return d_list_offsets[d_row_offsets[idx + 1]] - d_list_offsets[d_row_offsets[idx]]; }); - // Compute offsets from sizes. 
- thrust::exclusive_scan( - rmm::exec_policy(stream), d_out_offsets, d_out_offsets + num_rows + 1, d_out_offsets); + auto out_offsets = std::get<0>( + cudf::detail::make_offsets_child_column(sizes_itr, sizes_itr + num_rows, stream, mr)); return {std::move(out_offsets), std::move(validities)}; } diff --git a/cpp/src/lists/lists_column_factories.cu b/cpp/src/lists/lists_column_factories.cu index 7f82d32d327..278e5af07b2 100644 --- a/cpp/src/lists/lists_column_factories.cu +++ b/cpp/src/lists/lists_column_factories.cu @@ -19,6 +19,7 @@ #include #include #include +#include #include #include @@ -49,14 +50,9 @@ std::unique_ptr make_lists_column_from_scalar(list_scalar const& v auto mr_final = size == 1 ? mr : rmm::mr::get_current_device_resource(); // Handcraft a 1-row column - auto offsets = make_numeric_column( - data_type{type_to_id()}, 2, mask_state::UNALLOCATED, stream, mr_final); - auto m_offsets = offsets->mutable_view(); - thrust::sequence(rmm::exec_policy(stream), - m_offsets.begin(), - m_offsets.end(), - 0, - value.view().size()); + auto sizes_itr = thrust::constant_iterator(value.view().size()); + auto offsets = std::get<0>( + cudf::detail::make_offsets_child_column(sizes_itr, sizes_itr + 1, stream, mr_final)); size_type null_count = value.is_valid(stream) ? 0 : 1; auto null_mask_state = null_count ? mask_state::ALL_NULL : mask_state::UNALLOCATED; auto null_mask = cudf::detail::create_null_mask(1, null_mask_state, stream, mr_final); diff --git a/cpp/tests/column/factories_test.cpp b/cpp/tests/column/factories_test.cpp index 66de4e19b27..95706ad9e37 100644 --- a/cpp/tests/column/factories_test.cpp +++ b/cpp/tests/column/factories_test.cpp @@ -819,3 +819,13 @@ void struct_from_scalar(bool is_valid) TEST_F(ColumnFactoryTest, FromStructScalar) { struct_from_scalar(true); } TEST_F(ColumnFactoryTest, FromStructScalarNull) { struct_from_scalar(false); } + +TEST_F(ColumnFactoryTest, FromScalarErrors) +{ + cudf::string_scalar ss("hello world"); + EXPECT_THROW(cudf::make_column_from_scalar(ss, 214748365), std::overflow_error); + + using FCW = cudf::test::fixed_width_column_wrapper; + auto s = cudf::make_list_scalar(FCW({1, 2, 3, 4, 5, 6, 7, 8, 9, 10})); + EXPECT_THROW(cudf::make_column_from_scalar(*s, 214748365), std::overflow_error); +} From 5a9241681b9290f233e4f5470c1d4a4b394ac24b Mon Sep 17 00:00:00 2001 From: Jeremy Dyer Date: Mon, 14 Aug 2023 13:30:42 -0400 Subject: [PATCH 051/230] Add 'poll' function to custreamz kafka consumer (#13782) Streamz has updated their codebase to include a call to the Confluent Kafka Consumer library function 'poll'. Currently custreamz does not include this method. This PR adds the 'poll' function to custreamz to simply proxy the call to the underlying confluent kafka library so that streamz is no longer broken for end users. Without this function end users are no longer able to use custreamz with newer versions of the streamz library. This closes: #13600 Authors: - Jeremy Dyer (https://github.com/jdye64) Approvers: - Bradley Dice (https://github.com/bdice) URL: https://github.com/rapidsai/cudf/pull/13782 --- python/custreamz/custreamz/kafka.py | 21 ++++++++++++++++++++- 1 file changed, 20 insertions(+), 1 deletion(-) diff --git a/python/custreamz/custreamz/kafka.py b/python/custreamz/custreamz/kafka.py index a88b05c83b1..0def0ba746e 100644 --- a/python/custreamz/custreamz/kafka.py +++ b/python/custreamz/custreamz/kafka.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2022, NVIDIA CORPORATION. +# Copyright (c) 2020-2023, NVIDIA CORPORATION. 
import confluent_kafka as ck from cudf_kafka._lib.kafka import KafkaDatasource @@ -25,6 +25,7 @@ def __init__(self, kafka_configs): self.kafka_configs = kafka_configs self.kafka_meta_client = KafkaDatasource(kafka_configs) + self.ck_consumer = ck.Consumer(kafka_configs) def list_topics(self, specific_topic=None): """ @@ -270,3 +271,21 @@ def commit(self, offsets=None, asynchronous=True): self.kafka_meta_client.commit_offset( offs.topic.encode(), offs.partition, offs.offset ) + + def poll(self, timeout=None): + """ + Consumes a single message, calls callbacks and returns events. + + The application must check the returned Message object's + Message.error() method to distinguish between proper messages + (error() returns None), or an event or error + (see error().code() for specifics). + + Parameters + ---------- + timeout : float + Maximum time to block waiting for message, event or callback + (default: infinite (None translated into -1 in the + library)). (Seconds) + """ + return self.ck.poll(timeout) From b7682708adb1eb00c3a674d899cc3eade7f2d9b7 Mon Sep 17 00:00:00 2001 From: GALI PREM SAGAR Date: Mon, 14 Aug 2023 16:22:12 -0500 Subject: [PATCH 052/230] Change `NA` to `NaT` for `datetime` and `timedelta` types (#13868) Resolves: #13867 This PR: - [x] Changes all repr code of `datetime` & `timdelta` types to switch to represent null values as `NaT` from ``. - [x] Changes Scalar getitem to return `cudf.NaT` instead of `cudf.NA` Authors: - GALI PREM SAGAR (https://github.com/galipremsagar) Approvers: - Bradley Dice (https://github.com/bdice) URL: https://github.com/rapidsai/cudf/pull/13868 --- python/cudf/cudf/__init__.py | 4 +- python/cudf/cudf/_lib/scalar.pyx | 10 +- python/cudf/cudf/core/_internals/timezones.py | 2 +- python/cudf/cudf/core/_internals/where.py | 5 +- python/cudf/cudf/core/column/column.py | 3 +- python/cudf/cudf/core/dataframe.py | 14 ++- python/cudf/cudf/core/index.py | 14 ++- python/cudf/cudf/core/missing.py | 6 +- python/cudf/cudf/core/multiindex.py | 2 +- python/cudf/cudf/core/scalar.py | 10 +- python/cudf/cudf/core/series.py | 15 ++- python/cudf/cudf/testing/testing.py | 4 +- python/cudf/cudf/tests/test_binops.py | 7 +- python/cudf/cudf/tests/test_datetime.py | 5 + python/cudf/cudf/tests/test_list.py | 7 +- python/cudf/cudf/tests/test_parquet.py | 2 +- python/cudf/cudf/tests/test_repr.py | 106 +++++++----------- python/cudf/cudf/tests/test_scalar.py | 16 ++- python/cudf/cudf/tests/test_timedelta.py | 2 +- python/cudf/cudf/utils/dtypes.py | 5 +- python/cudf/cudf/utils/utils.py | 8 ++ 21 files changed, 144 insertions(+), 103 deletions(-) diff --git a/python/cudf/cudf/__init__.py b/python/cudf/cudf/__init__.py index d8cee514fb7..e5c78fca893 100644 --- a/python/cudf/cudf/__init__.py +++ b/python/cudf/cudf/__init__.py @@ -58,7 +58,7 @@ UInt64Index, interval_range, ) -from cudf.core.missing import NA +from cudf.core.missing import NA, NaT from cudf.core.multiindex import MultiIndex from cudf.core.reshape import ( concat, @@ -90,7 +90,6 @@ option_context, set_option, ) -from cudf.utils.dtypes import _NA_REP from cudf.utils.utils import clear_cache cuda.set_memory_manager(RMMNumbaManager) @@ -125,6 +124,7 @@ "ListDtype", "MultiIndex", "NA", + "NaT", "RangeIndex", "Scalar", "Series", diff --git a/python/cudf/cudf/_lib/scalar.pyx b/python/cudf/cudf/_lib/scalar.pyx index 0ff736b9204..39a1b0609cf 100644 --- a/python/cudf/cudf/_lib/scalar.pyx +++ b/python/cudf/cudf/_lib/scalar.pyx @@ -31,7 +31,7 @@ from cudf._lib.types import ( duration_unit_map, ) from cudf.core.dtypes import 
ListDtype, StructDtype -from cudf.core.missing import NA +from cudf.core.missing import NA, NaT from cudf._lib.column cimport Column from cudf._lib.cpp.column.column_view cimport column_view @@ -178,7 +178,7 @@ cdef class DeviceScalar: return self.get_raw_ptr()[0].is_valid() def __repr__(self): - if self.value is NA: + if cudf.utils.utils.is_na_like(self.value): return ( f"{self.__class__.__name__}" f"({self.value}, {repr(self.dtype)})" @@ -495,7 +495,7 @@ cdef _get_np_scalar_from_timestamp64(unique_ptr[scalar]& s): cdef scalar* s_ptr = s.get() if not s_ptr[0].is_valid(): - return NA + return NaT cdef libcudf_types.data_type cdtype = s_ptr[0].type() @@ -536,7 +536,7 @@ cdef _get_np_scalar_from_timedelta64(unique_ptr[scalar]& s): cdef scalar* s_ptr = s.get() if not s_ptr[0].is_valid(): - return NA + return NaT cdef libcudf_types.data_type cdtype = s_ptr[0].type() @@ -586,7 +586,7 @@ def as_device_scalar(val, dtype=None): def _is_null_host_scalar(slr): - if slr is None or slr is NA: + if cudf.utils.utils.is_na_like(slr): return True elif isinstance(slr, (np.datetime64, np.timedelta64)) and np.isnat(slr): return True diff --git a/python/cudf/cudf/core/_internals/timezones.py b/python/cudf/cudf/core/_internals/timezones.py index 2895fd3476c..67043d3fbb3 100644 --- a/python/cudf/cudf/core/_internals/timezones.py +++ b/python/cudf/cudf/core/_internals/timezones.py @@ -186,7 +186,7 @@ def localize( DatetimeColumn, data._scatter_by_column( data.isnull() | (ambiguous | nonexistent), - cudf.Scalar(cudf.NA, dtype=data.dtype), + cudf.Scalar(cudf.NaT, dtype=data.dtype), ), ) gmt_data = local_to_utc(localized, zone_name) diff --git a/python/cudf/cudf/core/_internals/where.py b/python/cudf/cudf/core/_internals/where.py index 6d4a2990e34..0f65861dc72 100644 --- a/python/cudf/cudf/core/_internals/where.py +++ b/python/cudf/cudf/core/_internals/where.py @@ -1,4 +1,4 @@ -# Copyright (c) 2021-2022, NVIDIA CORPORATION. +# Copyright (c) 2021-2023, NVIDIA CORPORATION. 
import warnings from typing import Tuple, Union @@ -13,7 +13,6 @@ is_scalar, ) from cudf.core.column import ColumnBase -from cudf.core.missing import NA from cudf.utils.dtypes import ( _can_cast, _dtype_can_hold_element, @@ -59,7 +58,7 @@ def _check_and_cast_columns_with_other( f"{type(other).__name__} to {source_dtype.name}" ) - if other in {None, NA}: + if cudf.utils.utils.is_na_like(other): return _normalize_categorical( source_col, cudf.Scalar(other, dtype=source_dtype) ) diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index 0b2b9fda2fd..263da9b350e 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -81,7 +81,6 @@ ListDtype, StructDtype, ) -from cudf.core.missing import NA from cudf.core.mixins import BinaryOperand, Reducible from cudf.errors import MixedTypeError from cudf.utils.dtypes import ( @@ -605,7 +604,7 @@ def __setitem__(self, key: Any, value: Any): self._mimic_inplace(out, inplace=True) def _wrap_binop_normalization(self, other): - if other is NA or other is None: + if cudf.utils.utils.is_na_like(other): return cudf.Scalar(other, dtype=self.dtype) if isinstance(other, np.ndarray) and other.ndim == 0: # Try and maintain the dtype diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index e4b944a88af..fc624c0b8eb 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -1694,7 +1694,19 @@ def _clean_nulls_from_dataframe(self, df): # TODO we need to handle this pass elif df._data[col].has_nulls(): - df[col] = df._data[col].astype("str").fillna(cudf._NA_REP) + fill_value = ( + str(cudf.NaT) + if isinstance( + df._data[col], + ( + cudf.core.column.DatetimeColumn, + cudf.core.column.TimeDeltaColumn, + ), + ) + else str(cudf.NA) + ) + + df[col] = df._data[col].astype("str").fillna(fill_value) else: df[col] = df._data[col] diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py index c3d468e5656..b7ee85758b9 100644 --- a/python/cudf/cudf/core/index.py +++ b/python/cudf/cudf/core/index.py @@ -1347,7 +1347,7 @@ def __repr__(self): else: output = repr(preprocess.to_pandas()) - output = output.replace("nan", cudf._NA_REP) + output = output.replace("nan", str(cudf.NA)) elif preprocess._values.nullable: output = repr(self._clean_nulls_from_index().to_pandas()) @@ -1499,8 +1499,14 @@ def __contains__(self, item): def _clean_nulls_from_index(self): if self._values.has_nulls(): + fill_value = ( + str(cudf.NaT) + if isinstance(self, (DatetimeIndex, TimedeltaIndex)) + else str(cudf.NA) + ) return cudf.Index( - self._values.astype("str").fillna(cudf._NA_REP), name=self.name + self._values.astype("str").fillna(fill_value), + name=self.name, ) return self @@ -2611,7 +2617,7 @@ def tz_localize(self, tz, ambiguous="NaT", nonexistent="NaT"): ... '2018-10-28 03:46:00'])) >>> s.dt.tz_localize("CET") 0 2018-10-28 01:20:00.000000000 - 1 + 1 NaT 2 2018-10-28 03:46:00.000000000 dtype: datetime64[ns, CET] @@ -3254,7 +3260,7 @@ def str(self): def _clean_nulls_from_index(self): if self._values.has_nulls(): - return self.fillna(cudf._NA_REP) + return self.fillna(str(cudf.NA)) else: return self diff --git a/python/cudf/cudf/core/missing.py b/python/cudf/cudf/core/missing.py index 02bcb7636f4..0d48a1d4136 100644 --- a/python/cudf/cudf/core/missing.py +++ b/python/cudf/cudf/core/missing.py @@ -1,9 +1,9 @@ -# Copyright (c) 2018-2022, NVIDIA CORPORATION. +# Copyright (c) 2018-2023, NVIDIA CORPORATION. 
# Pandas NAType enforces a single instance exists at a time # instantiating this class will yield the existing instance # of pandas._libs.missing.NAType, id(cudf.NA) == id(pd.NA). -from pandas import NA +from pandas import NA, NaT -__all__ = ["NA"] +__all__ = ["NA", "NaT"] diff --git a/python/cudf/cudf/core/multiindex.py b/python/cudf/cudf/core/multiindex.py index 6e9d068ef50..54c67458b55 100644 --- a/python/cudf/cudf/core/multiindex.py +++ b/python/cudf/cudf/core/multiindex.py @@ -504,7 +504,7 @@ def __repr__(self): ), ): preprocess_df[name] = col.astype("str").fillna( - cudf._NA_REP + str(cudf.NaT) ) tuples_list = list( diff --git a/python/cudf/cudf/core/scalar.py b/python/cudf/cudf/core/scalar.py index 438e3c7477d..a20628f6601 100644 --- a/python/cudf/cudf/core/scalar.py +++ b/python/cudf/cudf/core/scalar.py @@ -8,9 +8,9 @@ import pyarrow as pa import cudf -from cudf.api.types import is_scalar +from cudf.api.types import is_datetime64_dtype, is_scalar, is_timedelta64_dtype from cudf.core.dtypes import ListDtype, StructDtype -from cudf.core.missing import NA +from cudf.core.missing import NA, NaT from cudf.core.mixins import BinaryOperand from cudf.utils.dtypes import ( get_allowed_combinations_for_operator, @@ -243,7 +243,11 @@ def _preprocess_host_value(self, value, dtype): dtype = cudf.dtype(dtype) if not valid: - value = NA + value = ( + NaT + if is_datetime64_dtype(dtype) or is_timedelta64_dtype(dtype) + else NA + ) return value, dtype diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index a0cec69eca9..b63261ef840 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -1403,8 +1403,19 @@ def __repr__(self): preprocess._column, cudf.core.column.timedelta.TimeDeltaColumn, ): + fill_value = ( + str(cudf.NaT) + if isinstance( + preprocess._column, + ( + cudf.core.column.TimeDeltaColumn, + cudf.core.column.DatetimeColumn, + ), + ) + else str(cudf.NA) + ) output = repr( - preprocess.astype("O").fillna(cudf._NA_REP).to_pandas() + preprocess.astype("str").fillna(fill_value).to_pandas() ) elif isinstance( preprocess._column, cudf.core.column.CategoricalColumn @@ -1436,7 +1447,7 @@ def __repr__(self): min_rows=min_rows, max_rows=max_rows, length=show_dimensions, - na_rep=cudf._NA_REP, + na_rep=str(cudf.NA), ) else: output = repr(preprocess.to_pandas()) diff --git a/python/cudf/cudf/testing/testing.py b/python/cudf/cudf/testing/testing.py index 484c013f774..a9c54ddcaa1 100644 --- a/python/cudf/cudf/testing/testing.py +++ b/python/cudf/cudf/testing/testing.py @@ -19,7 +19,7 @@ is_string_dtype, is_struct_dtype, ) -from cudf.core.missing import NA +from cudf.core.missing import NA, NaT def dtype_can_compare_equal_to_other(dtype): @@ -290,7 +290,7 @@ def assert_column_equal( def null_safe_scalar_equals(left, right): - if left in {NA, np.nan} or right in {NA, np.nan}: + if left in {NA, NaT, np.nan} or right in {NA, NaT, np.nan}: return left is right return left == right diff --git a/python/cudf/cudf/tests/test_binops.py b/python/cudf/cudf/tests/test_binops.py index 65a15d7c567..549cd8da78e 100644 --- a/python/cudf/cudf/tests/test_binops.py +++ b/python/cudf/cudf/tests/test_binops.py @@ -1700,7 +1700,12 @@ def test_scalar_null_binops(op, dtype_l, dtype_r): rhs = cudf.Scalar(cudf.NA, dtype=dtype_r) result = op(lhs, rhs) - assert result.value is cudf.NA + assert result.value is ( + cudf.NaT + if cudf.api.types.is_datetime64_dtype(result.dtype) + or cudf.api.types.is_timedelta64_dtype(result.dtype) + else cudf.NA + ) # make sure dtype is the 
same as had there been a valid scalar valid_lhs = cudf.Scalar(1, dtype=dtype_l) diff --git a/python/cudf/cudf/tests/test_datetime.py b/python/cudf/cudf/tests/test_datetime.py index dcb8781e712..a59f8b62e7e 100644 --- a/python/cudf/cudf/tests/test_datetime.py +++ b/python/cudf/cudf/tests/test_datetime.py @@ -2107,3 +2107,8 @@ def test_datetime_binop_tz_timestamp(op): date_scalar = datetime.datetime.now(datetime.timezone.utc) with pytest.raises(NotImplementedError): op(s, date_scalar) + + +def test_datetime_getitem_na(): + s = cudf.Series([1, 2, None, 3], dtype="datetime64[ns]") + assert s[2] is cudf.NaT diff --git a/python/cudf/cudf/tests/test_list.py b/python/cudf/cudf/tests/test_list.py index 87a0424998f..5dd58d8a875 100644 --- a/python/cudf/cudf/tests/test_list.py +++ b/python/cudf/cudf/tests/test_list.py @@ -697,7 +697,12 @@ def test_list_scalar_host_construction_null(elem_type, nesting_level): dtype = cudf.ListDtype(dtype) slr = cudf.Scalar(None, dtype=dtype) - assert slr.value is cudf.NA + assert slr.value is ( + cudf.NaT + if cudf.api.types.is_datetime64_dtype(slr.dtype) + or cudf.api.types.is_timedelta64_dtype(slr.dtype) + else cudf.NA + ) @pytest.mark.parametrize( diff --git a/python/cudf/cudf/tests/test_parquet.py b/python/cudf/cudf/tests/test_parquet.py index ff4c2e2a14d..a08ab211b8e 100644 --- a/python/cudf/cudf/tests/test_parquet.py +++ b/python/cudf/cudf/tests/test_parquet.py @@ -2264,7 +2264,7 @@ def test_parquet_writer_statistics(tmpdir, pdf, add_nulls): pdf = pdf.drop(columns=["col_category", "col_bool"]) if not add_nulls: - # Timedelta types convert NA to None when reading from parquet into + # Timedelta types convert NaT to None when reading from parquet into # pandas which interferes with series.max()/min() for t in TIMEDELTA_TYPES: pdf["col_" + t] = pd.Series(np.arange(len(pdf.index))).astype(t) diff --git a/python/cudf/cudf/tests/test_repr.py b/python/cudf/cudf/tests/test_repr.py index e7fa401f1ec..b944e0483d0 100644 --- a/python/cudf/cudf/tests/test_repr.py +++ b/python/cudf/cudf/tests/test_repr.py @@ -40,10 +40,7 @@ def test_null_series(nrows, dtype): ps = sr.to_pandas() pd.options.display.max_rows = int(nrows) - psrepr = repr(ps) - psrepr = psrepr.replace("NaN", "") - psrepr = psrepr.replace("NaT", "") - psrepr = psrepr.replace("None", "") + psrepr = repr(ps).replace("NaN", "").replace("None", "") if "UInt" in psrepr: psrepr = psrepr.replace("UInt", "uint") elif "Int" in psrepr: @@ -71,12 +68,7 @@ def test_null_dataframe(ncols): gdf[dtype] = sr pdf = gdf.to_pandas() pd.options.display.max_columns = int(ncols) - pdf_repr = ( - repr(pdf) - .replace("NaN", "") - .replace("NaT", "") - .replace("None", "") - ) + pdf_repr = repr(pdf).replace("NaN", "").replace("None", "") assert pdf_repr.split() == repr(gdf).split() pd.reset_option("display.max_columns") @@ -359,33 +351,33 @@ def test_dataframe_sliced(gdf, slice, max_seq_items, max_rows): cudf.Index(np.array([10, 20, 30, None], dtype="datetime64[ns]")), "DatetimeIndex([1970-01-01 00:00:00.000000010, " "1970-01-01 00:00:00.000000020," - "\n 1970-01-01 00:00:00.000000030, ],\n " + "\n 1970-01-01 00:00:00.000000030, NaT],\n " "dtype='datetime64[ns]')", ), ( cudf.Index(np.array([10, 20, 30, None], dtype="datetime64[s]")), "DatetimeIndex([1970-01-01 00:00:10, " "1970-01-01 00:00:20, 1970-01-01 00:00:30,\n" - " ],\n dtype='datetime64[s]')", + " NaT],\n dtype='datetime64[s]')", ), ( cudf.Index(np.array([10, 20, 30, None], dtype="datetime64[us]")), "DatetimeIndex([1970-01-01 00:00:00.000010, " "1970-01-01 00:00:00.000020,\n " - 
"1970-01-01 00:00:00.000030, ],\n " + "1970-01-01 00:00:00.000030, NaT],\n " "dtype='datetime64[us]')", ), ( cudf.Index(np.array([10, 20, 30, None], dtype="datetime64[ms]")), "DatetimeIndex([1970-01-01 00:00:00.010, " "1970-01-01 00:00:00.020,\n " - "1970-01-01 00:00:00.030, ],\n " + "1970-01-01 00:00:00.030, NaT],\n " "dtype='datetime64[ms]')", ), ( cudf.Index(np.array([None] * 10, dtype="datetime64[ms]")), - "DatetimeIndex([, , , , , , , , " - ",\n ],\n dtype='datetime64[ms]')", + "DatetimeIndex([NaT, NaT, NaT, NaT, NaT, NaT, NaT, NaT, " + "NaT, NaT], dtype='datetime64[ms]')", ), ], ) @@ -473,12 +465,7 @@ def test_dataframe_null_index_repr(df, pandas_special_case): pdf = df gdf = cudf.from_pandas(pdf) - expected_repr = ( - repr(pdf) - .replace("NaN", "") - .replace("NaT", "") - .replace("None", "") - ) + expected_repr = repr(pdf).replace("NaN", "").replace("None", "") actual_repr = repr(gdf) if pandas_special_case: @@ -552,12 +539,7 @@ def test_series_null_index_repr(sr, pandas_special_case): psr = sr gsr = cudf.from_pandas(psr) - expected_repr = ( - repr(psr) - .replace("NaN", "") - .replace("NaT", "") - .replace("None", "") - ) + expected_repr = repr(psr).replace("NaN", "").replace("None", "") actual_repr = repr(gsr) if pandas_special_case: @@ -603,9 +585,7 @@ def test_timedelta_series_s_us_repr(data, dtype): sr = cudf.Series(data, dtype=dtype) psr = sr.to_pandas() - expected = ( - repr(psr).replace("timedelta64[ns]", dtype).replace("NaT", "") - ) + expected = repr(psr).replace("timedelta64[ns]", dtype) actual = repr(sr) assert expected.split() == actual.split() @@ -658,7 +638,7 @@ def test_timedelta_series_s_us_repr(data, dtype): """ 0 0 days 00:00:00.001000000 1 0 days 00:00:00.000200000 - 2 + 2 NaT dtype: timedelta64[ns] """ ), @@ -669,7 +649,7 @@ def test_timedelta_series_s_us_repr(data, dtype): """ 0 0 days 00:16:40 1 0 days 00:03:20 - 2 + 2 NaT dtype: timedelta64[ms] """ ), @@ -680,11 +660,11 @@ def test_timedelta_series_s_us_repr(data, dtype): ), textwrap.dedent( """ - 0 - 1 - 2 - 3 - 4 + 0 NaT + 1 NaT + 2 NaT + 3 NaT + 4 NaT dtype: timedelta64[ns] """ ), @@ -695,11 +675,11 @@ def test_timedelta_series_s_us_repr(data, dtype): ), textwrap.dedent( """ - 0 - 1 - 2 - 3 - 4 + 0 NaT + 1 NaT + 2 NaT + 3 NaT + 4 NaT dtype: timedelta64[ms] """ ), @@ -930,10 +910,10 @@ def test_timedelta_series_ns_ms_repr(ser, expected_repr): """ a b 0 1579 days 08:54:14 10 - 1 11 + 1 NaT 11 2 2839 days 15:29:05 22 3 2586 days 00:33:31 33 - 4 44 + 4 NaT 44 5 42066 days 12:52:14 55 6 0 days 06:27:14 66 """ @@ -961,10 +941,10 @@ def test_timedelta_series_ns_ms_repr(ser, expected_repr): """ a a 1579 days 08:54:14 - b + b NaT c 2839 days 15:29:05 d 2586 days 00:33:31 - e + e NaT f 42066 days 12:52:14 g 0 days 06:27:14 """ @@ -994,10 +974,10 @@ def test_timedelta_series_ns_ms_repr(ser, expected_repr): """ a 1 days 13:54:17.654 1 - 2 + NaT 2 2 days 20:09:05.345 3 2 days 14:03:52.411 4 - 5 + NaT 5 42 days 01:35:48.734 6 0 days 00:00:23.234 7 """ @@ -1027,10 +1007,10 @@ def test_timedelta_series_ns_ms_repr(ser, expected_repr): """ a 0 days 00:00:00.136457654 a - f + NaT f 0 days 00:00:00.245345345 q 0 days 00:00:00.223432411 e - w + NaT w 0 days 00:00:03.634548734 e 0 days 00:00:00.000023234 t """ @@ -1057,7 +1037,7 @@ def test_timedelta_dataframe_repr(df, expected_repr): cudf.Index( [None, None, None, None, None], dtype="timedelta64[us]" ), - "TimedeltaIndex([, , , , ], " + "TimedeltaIndex([NaT, NaT, NaT, NaT, NaT], " "dtype='timedelta64[us]')", ), ( @@ -1073,9 +1053,9 @@ def test_timedelta_dataframe_repr(df, 
expected_repr): ], dtype="timedelta64[us]", ), - "TimedeltaIndex([0 days 00:02:16.457654, , " + "TimedeltaIndex([0 days 00:02:16.457654, NaT, " "0 days 00:04:05.345345, " - "0 days 00:03:43.432411, ," + "0 days 00:03:43.432411, NaT," " 0 days 01:00:34.548734, 0 days 00:00:00.023234]," " dtype='timedelta64[us]')", ), @@ -1092,8 +1072,8 @@ def test_timedelta_dataframe_repr(df, expected_repr): ], dtype="timedelta64[s]", ), - "TimedeltaIndex([1579 days 08:54:14, , 2839 days 15:29:05," - " 2586 days 00:33:31, , 42066 days 12:52:14, " + "TimedeltaIndex([1579 days 08:54:14, NaT, 2839 days 15:29:05," + " 2586 days 00:33:31, NaT, 42066 days 12:52:14, " "0 days 06:27:14]," " dtype='timedelta64[s]')", ), @@ -1190,7 +1170,7 @@ def test_multiindex_repr(pmi, max_seq_items): .index, textwrap.dedent( """ - MultiIndex([( '', 'abc'), + MultiIndex([( 'NaT', 'abc'), ('1970-01-01 00:00:00.000000001', ), ('1970-01-01 00:00:00.000000002', 'xyz'), ('1970-01-01 00:00:00.000000003', )], @@ -1210,7 +1190,7 @@ def test_multiindex_repr(pmi, max_seq_items): .index, textwrap.dedent( """ - MultiIndex([( '', 'abc', 0.345), + MultiIndex([( 'NaT', 'abc', 0.345), ('1970-01-01 00:00:00.000000001', , ), ('1970-01-01 00:00:00.000000002', 'xyz', 100.0), ('1970-01-01 00:00:00.000000003', , 10.0)], @@ -1230,7 +1210,7 @@ def test_multiindex_repr(pmi, max_seq_items): .index, textwrap.dedent( """ - MultiIndex([('abc', '', 0.345), + MultiIndex([('abc', 'NaT', 0.345), ( , '0 days 00:00:00.000000001', ), ('xyz', '0 days 00:00:00.000000002', 100.0), ( , '0 days 00:00:00.000000003', 10.0)], @@ -1272,10 +1252,10 @@ def test_multiindex_repr(pmi, max_seq_items): .index, textwrap.dedent( """ - MultiIndex([('', ), - ('', ), - ('', ), - ('', )], + MultiIndex([('NaT', ), + ('NaT', ), + ('NaT', ), + ('NaT', )], names=['b', 'a']) """ ), diff --git a/python/cudf/cudf/tests/test_scalar.py b/python/cudf/cudf/tests/test_scalar.py index c1aeb987eff..d73a1d40aaa 100644 --- a/python/cudf/cudf/tests/test_scalar.py +++ b/python/cudf/cudf/tests/test_scalar.py @@ -212,7 +212,12 @@ def test_scalar_roundtrip(value): ) def test_null_scalar(dtype): s = cudf.Scalar(None, dtype=dtype) - assert s.value is cudf.NA + if cudf.api.types.is_datetime64_dtype( + dtype + ) or cudf.api.types.is_timedelta64_dtype(dtype): + assert s.value is cudf.NaT + else: + assert s.value is cudf.NA assert s.dtype == ( cudf.dtype(dtype) if not isinstance(dtype, cudf.core.dtypes.DecimalDtype) @@ -236,7 +241,7 @@ def test_null_scalar(dtype): ) def test_nat_to_null_scalar_succeeds(value): s = cudf.Scalar(value) - assert s.value is cudf.NA + assert s.value is cudf.NaT assert not s.is_valid() assert s.dtype == value.dtype @@ -349,7 +354,12 @@ def test_scalar_implicit_int_conversion(value): def test_scalar_invalid_implicit_conversion(cls, dtype): try: - cls(pd.NA) + cls( + pd.NaT + if cudf.api.types.is_datetime64_dtype(dtype) + or cudf.api.types.is_timedelta64_dtype(dtype) + else pd.NA + ) except TypeError as e: with pytest.raises(TypeError, match=re.escape(str(e))): slr = cudf.Scalar(None, dtype=dtype) diff --git a/python/cudf/cudf/tests/test_timedelta.py b/python/cudf/cudf/tests/test_timedelta.py index 82ef309d116..681b42fd3bd 100644 --- a/python/cudf/cudf/tests/test_timedelta.py +++ b/python/cudf/cudf/tests/test_timedelta.py @@ -1441,4 +1441,4 @@ def test_timdelta_binop_tz_timestamp(op): def test_timedelta_getitem_na(): s = cudf.Series([1, 2, None, 3], dtype="timedelta64[ns]") - assert s[2] is cudf.NA + assert s[2] is cudf.NaT diff --git a/python/cudf/cudf/utils/dtypes.py 
b/python/cudf/cudf/utils/dtypes.py index d5e9e5854df..ea96a0859ce 100644 --- a/python/cudf/cudf/utils/dtypes.py +++ b/python/cudf/cudf/utils/dtypes.py @@ -13,9 +13,6 @@ import cudf from cudf._typing import DtypeObj from cudf.api.types import is_bool, is_float, is_integer -from cudf.core.missing import NA - -_NA_REP = "" """Map numpy dtype to pyarrow types. Note that np.bool_ bitwidth (8) is different from pa.bool_ (1). Special @@ -639,7 +636,7 @@ def _can_cast(from_dtype, to_dtype): `np.can_cast` but with some special handling around cudf specific dtypes. """ - if from_dtype in {None, NA}: + if cudf.utils.utils.is_na_like(from_dtype): return True if isinstance(from_dtype, type): from_dtype = cudf.dtype(from_dtype) diff --git a/python/cudf/cudf/utils/utils.py b/python/cudf/cudf/utils/utils.py index dadd9aa2cc4..e2cb3f145a1 100644 --- a/python/cudf/cudf/utils/utils.py +++ b/python/cudf/cudf/utils/utils.py @@ -345,6 +345,14 @@ def search_range(x: int, ri: range, *, side: str) -> int: return max(min(len(ri), offset), 0) +def is_na_like(obj): + """ + Check if `obj` is a cudf NA value, + i.e., None, cudf.NA or cudf.NaT + """ + return obj is None or obj is cudf.NA or obj is cudf.NaT + + def _get_color_for_nvtx(name): m = hashlib.sha256() m.update(name.encode()) From 582d3109b55ca1721b3ddb042d5282a6eef9b8b8 Mon Sep 17 00:00:00 2001 From: GALI PREM SAGAR Date: Mon, 14 Aug 2023 16:34:36 -0500 Subject: [PATCH 053/230] Raise error when `astype(object)` is called in pandas compatibility mode (#13862) This PR disables type-casting to `object` dtype for non-string columns. It's a no-op for string columns. We disable this only in pandas-compatibility mode since the usage of `np.dtype("O")` is wide-spread and not quite possible to remove it's support entirely at this point. Authors: - GALI PREM SAGAR (https://github.com/galipremsagar) Approvers: - Bradley Dice (https://github.com/bdice) URL: https://github.com/rapidsai/cudf/pull/13862 --- python/cudf/cudf/core/column/column.py | 7 +++++++ python/cudf/cudf/tests/test_series.py | 20 ++++++++++++++++++++ 2 files changed, 27 insertions(+) diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index 263da9b350e..bd0e051c4b7 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -989,6 +989,13 @@ def astype(self, dtype: Dtype, **kwargs) -> ColumnBase: np.object_, str, }: + if cudf.get_option("mode.pandas_compatible") and np.dtype( + dtype + ).type in {np.object_}: + raise ValueError( + f"Casting to {dtype} is not supported, use " + "`.astype('str')` instead." 
+ ) return self.as_string_column(dtype, **kwargs) elif is_list_dtype(dtype): if not self.dtype == dtype: diff --git a/python/cudf/cudf/tests/test_series.py b/python/cudf/cudf/tests/test_series.py index 6b009d7e913..2c4befb8393 100644 --- a/python/cudf/cudf/tests/test_series.py +++ b/python/cudf/cudf/tests/test_series.py @@ -2224,3 +2224,23 @@ def __getitem__(self, key): def test_series_constructor_error_mixed_type(): with pytest.raises(pa.ArrowTypeError): cudf.Series(["abc", np.nan, "123"], nan_as_null=False) + + +def test_series_typecast_to_object_error(): + actual = cudf.Series([1, 2, 3], dtype="datetime64[ns]") + with cudf.option_context("mode.pandas_compatible", True): + with pytest.raises(ValueError): + actual.astype(object) + with pytest.raises(ValueError): + actual.astype(np.dtype("object")) + new_series = actual.astype("str") + assert new_series[0] == "1970-01-01 00:00:00.000000001" + + +def test_series_typecast_to_object(): + actual = cudf.Series([1, 2, 3], dtype="datetime64[ns]") + with cudf.option_context("mode.pandas_compatible", False): + new_series = actual.astype(object) + assert new_series[0] == "1970-01-01 00:00:00.000000001" + new_series = actual.astype(np.dtype("object")) + assert new_series[0] == "1970-01-01 00:00:00.000000001" From 049248ddf57f4889ca78923c6d891a7abb658dc9 Mon Sep 17 00:00:00 2001 From: GALI PREM SAGAR Date: Mon, 14 Aug 2023 18:14:22 -0500 Subject: [PATCH 054/230] Bring parity with pandas for `datetime` & `timedelta` comparison operations (#13877) Resolves: #13876 This PR brings parity with pandas `datetime` & `timedelta` comparison operations by filling the null locations of results with `True`/`False` according to the binary operation. **Note: This is only enabled in pandas-compatibility mode.** Authors: - GALI PREM SAGAR (https://github.com/galipremsagar) Approvers: - Bradley Dice (https://github.com/bdice) URL: https://github.com/rapidsai/cudf/pull/13877 --- python/cudf/cudf/core/column/datetime.py | 12 +++++++++-- python/cudf/cudf/core/column/timedelta.py | 14 +++++++++--- python/cudf/cudf/tests/test_datetime.py | 21 ++++++++++++++++++ python/cudf/cudf/tests/test_timedelta.py | 26 +++++++++++++++++++++++ 4 files changed, 68 insertions(+), 5 deletions(-) diff --git a/python/cudf/cudf/core/column/datetime.py b/python/cudf/cudf/core/column/datetime.py index 8073092775d..da6c4fb858c 100644 --- a/python/cudf/cudf/core/column/datetime.py +++ b/python/cudf/cudf/core/column/datetime.py @@ -460,14 +460,22 @@ def _binaryop(self, other: ColumnBinaryOperand, op: str) -> ColumnBase: if isinstance(other, ColumnBase) and not isinstance( other, DatetimeColumn ): - return _all_bools_with_nulls( + result = _all_bools_with_nulls( self, other, bool_fill_value=op == "__ne__" ) + if cudf.get_option("mode.pandas_compatible"): + result = result.fillna(op == "__ne__") + return result if out_dtype is None: return NotImplemented - return libcudf.binaryop.binaryop(lhs, rhs, op, out_dtype) + result = libcudf.binaryop.binaryop(lhs, rhs, op, out_dtype) + if cudf.get_option( + "mode.pandas_compatible" + ) and out_dtype == cudf.dtype(np.bool_): + result = result.fillna(op == "__ne__") + return result def fillna( self, diff --git a/python/cudf/cudf/core/column/timedelta.py b/python/cudf/cudf/core/column/timedelta.py index b571461b307..e254761e9ec 100644 --- a/python/cudf/cudf/core/column/timedelta.py +++ b/python/cudf/cudf/core/column/timedelta.py @@ -181,7 +181,7 @@ def _binaryop(self, other: ColumnBinaryOperand, op: str) -> ColumnBase: "__ge__", "NULL_EQUALS", }: - out_dtype = 
np.bool_ + out_dtype = cudf.dtype(np.bool_) elif op == "__mod__": out_dtype = determine_out_dtype(self.dtype, other.dtype) elif op in {"__truediv__", "__floordiv__"}: @@ -206,16 +206,24 @@ def _binaryop(self, other: ColumnBinaryOperand, op: str) -> ColumnBase: if isinstance(other, ColumnBase) and not isinstance( other, TimeDeltaColumn ): - return _all_bools_with_nulls( + result = _all_bools_with_nulls( self, other, bool_fill_value=op == "__ne__" ) + if cudf.get_option("mode.pandas_compatible"): + result = result.fillna(op == "__ne__") + return result if out_dtype is None: return NotImplemented lhs, rhs = (other, this) if reflect else (this, other) - return libcudf.binaryop.binaryop(lhs, rhs, op, out_dtype) + result = libcudf.binaryop.binaryop(lhs, rhs, op, out_dtype) + if cudf.get_option( + "mode.pandas_compatible" + ) and out_dtype == cudf.dtype(np.bool_): + result = result.fillna(op == "__ne__") + return result def normalize_binop_value(self, other) -> ColumnBinaryOperand: if isinstance(other, (ColumnBase, cudf.Scalar)): diff --git a/python/cudf/cudf/tests/test_datetime.py b/python/cudf/cudf/tests/test_datetime.py index a59f8b62e7e..4c4657ccba1 100644 --- a/python/cudf/cudf/tests/test_datetime.py +++ b/python/cudf/cudf/tests/test_datetime.py @@ -2109,6 +2109,27 @@ def test_datetime_binop_tz_timestamp(op): op(s, date_scalar) +@pytest.mark.parametrize( + "data1", [["20110101", "20120101", None, "20140101", None]] +) +@pytest.mark.parametrize( + "data2", [["20110101", "20120101", "20130101", None, None]] +) +@pytest.mark.parametrize("op", _cmpops) +def test_datetime_series_cmpops_pandas_compatibility(data1, data2, op): + gsr1 = cudf.Series(data=data1, dtype="datetime64[ns]") + psr1 = gsr1.to_pandas() + + gsr2 = cudf.Series(data=data2, dtype="datetime64[ns]") + psr2 = gsr2.to_pandas() + + expect = op(psr1, psr2) + with cudf.option_context("mode.pandas_compatible", True): + got = op(gsr1, gsr2) + + assert_eq(expect, got) + + def test_datetime_getitem_na(): s = cudf.Series([1, 2, None, 3], dtype="datetime64[ns]") assert s[2] is cudf.NaT diff --git a/python/cudf/cudf/tests/test_timedelta.py b/python/cudf/cudf/tests/test_timedelta.py index 681b42fd3bd..ef39a4fef5a 100644 --- a/python/cudf/cudf/tests/test_timedelta.py +++ b/python/cudf/cudf/tests/test_timedelta.py @@ -57,6 +57,15 @@ [12, 11, 2.32, 2234.32411, 2343.241, 23432.4, 23234], ] +_cmpops = [ + operator.lt, + operator.gt, + operator.le, + operator.ge, + operator.eq, + operator.ne, +] + @pytest.mark.parametrize( "data", @@ -1442,3 +1451,20 @@ def test_timdelta_binop_tz_timestamp(op): def test_timedelta_getitem_na(): s = cudf.Series([1, 2, None, 3], dtype="timedelta64[ns]") assert s[2] is cudf.NaT + + +@pytest.mark.parametrize("data1", [[123, 456, None, 321, None]]) +@pytest.mark.parametrize("data2", [[123, 456, 789, None, None]]) +@pytest.mark.parametrize("op", _cmpops) +def test_timedelta_series_cmpops_pandas_compatibility(data1, data2, op): + gsr1 = cudf.Series(data=data1, dtype="timedelta64[ns]") + psr1 = gsr1.to_pandas() + + gsr2 = cudf.Series(data=data2, dtype="timedelta64[ns]") + psr2 = gsr2.to_pandas() + + expect = op(psr1, psr2) + with cudf.option_context("mode.pandas_compatible", True): + got = op(gsr1, gsr2) + + assert_eq(expect, got) From 8b72662abf8b044a176fa9aac082c1834686ef83 Mon Sep 17 00:00:00 2001 From: GALI PREM SAGAR Date: Mon, 14 Aug 2023 18:29:42 -0500 Subject: [PATCH 055/230] Check for the presence of all values in `MultiIndex.isin` (#13879) Fixes: #13861 This PR fixes an issue in `MulitIndex.isin` where, previously 
we calculated partial matches. With this PR we will only return `True` for complete matches. Authors: - GALI PREM SAGAR (https://github.com/galipremsagar) Approvers: - Bradley Dice (https://github.com/bdice) URL: https://github.com/rapidsai/cudf/pull/13879 --- python/cudf/cudf/core/multiindex.py | 18 ++++++------------ python/cudf/cudf/tests/test_index.py | 4 ++++ 2 files changed, 10 insertions(+), 12 deletions(-) diff --git a/python/cudf/cudf/core/multiindex.py b/python/cudf/cudf/core/multiindex.py index 54c67458b55..5ab9af36175 100644 --- a/python/cudf/cudf/core/multiindex.py +++ b/python/cudf/cudf/core/multiindex.py @@ -730,18 +730,12 @@ def isin(self, values, level=None): values_idx = cudf.MultiIndex.from_tuples( values, names=self.names ) - - res = [] - for name in self.names: - level_idx = self.get_level_values(name) - value_idx = values_idx.get_level_values(name) - - existence = level_idx.isin(value_idx) - res.append(existence) - - result = res[0] - for i in res[1:]: - result = result & i + self_df = self.to_frame(index=False).reset_index() + values_df = values_idx.to_frame(index=False) + idx = self_df.merge(values_df)._data["index"] + res = cudf.core.column.full(size=len(self), fill_value=False) + res[idx] = True + result = res.values else: level_series = self.get_level_values(level) result = level_series.isin(values) diff --git a/python/cudf/cudf/tests/test_index.py b/python/cudf/cudf/tests/test_index.py index 2da3d3d3ce1..aefaacdac29 100644 --- a/python/cudf/cudf/tests/test_index.py +++ b/python/cudf/cudf/tests/test_index.py @@ -2235,11 +2235,15 @@ def test_isin_index(data, values): [[1, 2, 3, 10, 100], ["red", "blue", "green", "pink", "white"]], names=("number", "color"), ), + pd.MultiIndex.from_product( + [[0, 1], ["red", "blue", "green"]], names=("number", "color") + ), ], ) @pytest.mark.parametrize( "values,level,err", [ + ([(1, "red"), (2, "blue"), (0, "green")], None, None), (["red", "orange", "yellow"], "color", None), (["red", "white", "yellow"], "color", None), ([0, 1, 2, 10, 11, 15], "number", None), From 1d58d5f4744e191af1044d66ff06a2c62b79ab5e Mon Sep 17 00:00:00 2001 From: Liangcai Li Date: Tue, 15 Aug 2023 12:26:10 +0800 Subject: [PATCH 056/230] Exclude some tests from running with the compute sanitizer (#13872) Authors: - Liangcai Li (https://github.com/firestarman) Approvers: - Robert (Bobby) Evans (https://github.com/revans2) - Jason Lowe (https://github.com/jlowe) URL: https://github.com/rapidsai/cudf/pull/13872 --- java/src/test/java/ai/rapids/cudf/CudaTest.java | 2 ++ java/src/test/java/ai/rapids/cudf/RmmTest.java | 13 ++++++++++++- java/src/test/java/ai/rapids/cudf/TableTest.java | 4 ++++ .../ai/rapids/cudf/UnsafeMemoryAccessorTest.java | 4 +++- 4 files changed, 21 insertions(+), 2 deletions(-) diff --git a/java/src/test/java/ai/rapids/cudf/CudaTest.java b/java/src/test/java/ai/rapids/cudf/CudaTest.java index e29bf1a672e..2edd7f36cb7 100644 --- a/java/src/test/java/ai/rapids/cudf/CudaTest.java +++ b/java/src/test/java/ai/rapids/cudf/CudaTest.java @@ -16,6 +16,7 @@ package ai.rapids.cudf; +import org.junit.jupiter.api.Tag; import org.junit.jupiter.api.Test; import static org.junit.jupiter.api.Assertions.assertEquals; @@ -33,6 +34,7 @@ public void testGetCudaRuntimeInfo() { assertEquals(Cuda.getNativeComputeMode(), Cuda.getComputeMode().nativeId); } + @Tag("noSanitizer") @Test public void testCudaException() { assertThrows(CudaException.class, () -> { diff --git a/java/src/test/java/ai/rapids/cudf/RmmTest.java b/java/src/test/java/ai/rapids/cudf/RmmTest.java 
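# [Editorial aside — not part of the patch series] A minimal, hedged sketch of
# the pandas-compatibility behaviours introduced by the cudf changes above,
# based directly on the tests those patches add (test_datetime.py,
# test_series.py, test_index.py); no API beyond what the diffs show is assumed.
import cudf

# Null datetime/timedelta elements now surface as cudf.NaT instead of cudf.NA.
s = cudf.Series([1, 2, None, 3], dtype="datetime64[ns]")
assert s[2] is cudf.NaT

with cudf.option_context("mode.pandas_compatible", True):
    # astype(object) on a non-string column is rejected; astype("str") still works.
    try:
        s.astype(object)
    except ValueError:
        pass
    # Comparisons fill null positions the way pandas does (False for ==, True for !=).
    eq = s == cudf.Series([1, 2, 3, 4], dtype="datetime64[ns]")  # null slot -> False

# MultiIndex.isin now requires a complete match across all levels, not per-level hits.
midx = cudf.MultiIndex.from_tuples(
    [(1, "red"), (2, "blue")], names=("number", "color")
)
print(midx.isin([(1, "red"), (1, "blue")]))  # -> [True, False]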
index 352f17e6174..cd53cf7068a 100644 --- a/java/src/test/java/ai/rapids/cudf/RmmTest.java +++ b/java/src/test/java/ai/rapids/cudf/RmmTest.java @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2022, NVIDIA CORPORATION. + * Copyright (c) 2020-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -18,6 +18,7 @@ import org.junit.jupiter.api.AfterEach; import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Tag; import org.junit.jupiter.api.Test; import org.junit.jupiter.params.ParameterizedTest; import org.junit.jupiter.params.provider.ValueSource; @@ -187,6 +188,7 @@ public void testScopedMaxOutstandingNegative(int rmmAllocMode) { } } + @Tag("noSanitizer") @ParameterizedTest @ValueSource(ints = { RmmAllocationMode.CUDA_DEFAULT, @@ -257,6 +259,7 @@ public void onDeallocated(long sizeDeallocated) { assertEquals(1024, totalDeallocated.get()); } + @Tag("noSanitizer") @Test public void testSetEventHandlerTwice() { Rmm.initialize(RmmAllocationMode.CUDA_DEFAULT, Rmm.logToStderr(), 0L); @@ -278,6 +281,7 @@ public boolean onAllocFailure(long sizeRequested, int retryCount) { assertThrows(RmmException.class, () -> Rmm.setEventHandler(otherHandler)); } + @Tag("noSanitizer") @Test public void testClearEventHandler() { Rmm.initialize(RmmAllocationMode.CUDA_DEFAULT, Rmm.logToStderr(), 0L); @@ -304,6 +308,7 @@ public boolean onAllocFailure(long sizeRequested, int retryCount) { } } + @Tag("noSanitizer") @Test public void testAllocOnlyThresholds() { final AtomicInteger allocInvocations = new AtomicInteger(0); @@ -367,6 +372,7 @@ public void onDeallocThreshold(long totalAllocSize) { assertEquals(0, deallocInvocations.get()); } + @Tag("noSanitizer") @Test public void testThresholds() { final AtomicInteger allocInvocations = new AtomicInteger(0); @@ -451,6 +457,7 @@ public void onDeallocThreshold(long totalAllocSize) { assertEquals(2, deallocInvocations.get()); } + @Tag("noSanitizer") @Test public void testExceptionHandling() { Rmm.initialize(RmmAllocationMode.POOL, Rmm.logToStderr(), 1024 * 1024L); @@ -511,6 +518,7 @@ public void testThreadAutoDeviceSetup() throws Exception { } } + @Tag("noSanitizer") @ParameterizedTest @ValueSource(ints = { RmmAllocationMode.CUDA_DEFAULT, @@ -523,6 +531,7 @@ public void testSetDeviceThrowsAfterRmmInit(int rmmAllocMode) { Cuda.autoSetDevice(); } + @Tag("noSanitizer") @Test public void testPoolSize() { Rmm.initialize(RmmAllocationMode.POOL, Rmm.logToStderr(), 1024); @@ -535,6 +544,7 @@ public void testPoolSize() { } } + @Tag("noSanitizer") @Test public void testCudaAsyncMemoryResourceSize() { try { @@ -553,6 +563,7 @@ public void testCudaAsyncMemoryResourceSize() { } } + @Tag("noSanitizer") @Test public void testCudaAsyncIsIncompatibleWithManaged() { assertThrows(IllegalArgumentException.class, diff --git a/java/src/test/java/ai/rapids/cudf/TableTest.java b/java/src/test/java/ai/rapids/cudf/TableTest.java index 94de3c6a11c..f17197ef608 100644 --- a/java/src/test/java/ai/rapids/cudf/TableTest.java +++ b/java/src/test/java/ai/rapids/cudf/TableTest.java @@ -40,6 +40,7 @@ import org.apache.parquet.schema.GroupType; import org.apache.parquet.schema.MessageType; import org.apache.parquet.schema.OriginalType; +import org.junit.jupiter.api.Tag; import org.junit.jupiter.api.Test; import java.io.*; @@ -8602,6 +8603,9 @@ void testORCWriteToFileWithColNames() throws IOException { } } + // https://github.com/NVIDIA/spark-rapids-jni/issues/1338 + // Need to remove this tag if 
#1338 is fixed. + @Tag("noSanitizer") @Test void testORCReadAndWriteForDecimal128() throws IOException { File tempFile = File.createTempFile("test", ".orc"); diff --git a/java/src/test/java/ai/rapids/cudf/UnsafeMemoryAccessorTest.java b/java/src/test/java/ai/rapids/cudf/UnsafeMemoryAccessorTest.java index b7cadb2786a..a3684cb42b9 100644 --- a/java/src/test/java/ai/rapids/cudf/UnsafeMemoryAccessorTest.java +++ b/java/src/test/java/ai/rapids/cudf/UnsafeMemoryAccessorTest.java @@ -1,6 +1,6 @@ /* * - * Copyright (c) 2019, NVIDIA CORPORATION. + * Copyright (c) 2019-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -18,11 +18,13 @@ package ai.rapids.cudf; +import org.junit.jupiter.api.Tag; import org.junit.jupiter.api.Test; import static org.junit.jupiter.api.Assertions.assertEquals; import static org.junit.jupiter.api.Assertions.assertNotEquals; +@Tag("noSanitizer") public class UnsafeMemoryAccessorTest { @Test public void testAllocate() { From da2560fe6fa042b8dc44b6da995083f500e52847 Mon Sep 17 00:00:00 2001 From: Ed Seidl Date: Tue, 15 Aug 2023 14:43:13 -0700 Subject: [PATCH 057/230] Optionally write version 2 page headers in Parquet writer (#13751) Part of #13501. This adds the ability to write V2 page headers to the Parquet writer. Authors: - Ed Seidl (https://github.com/etseidl) - Vukasin Milovanovic (https://github.com/vuule) Approvers: - Vukasin Milovanovic (https://github.com/vuule) - Nghia Truong (https://github.com/ttnghia) URL: https://github.com/rapidsai/cudf/pull/13751 --- cpp/include/cudf/io/parquet.hpp | 48 +++++ cpp/include/cudf_test/base_fixture.hpp | 21 ++ cpp/src/io/functions.cpp | 13 ++ .../io/parquet/compact_protocol_reader.cpp | 15 +- .../io/parquet/compact_protocol_reader.hpp | 1 + cpp/src/io/parquet/page_enc.cu | 117 ++++++++--- cpp/src/io/parquet/parquet.hpp | 15 ++ cpp/src/io/parquet/parquet_gpu.hpp | 5 + cpp/src/io/parquet/writer_impl.cu | 25 ++- cpp/src/io/parquet/writer_impl.hpp | 1 + cpp/tests/io/parquet_test.cpp | 188 +++++++++++++++--- 11 files changed, 388 insertions(+), 61 deletions(-) diff --git a/cpp/include/cudf/io/parquet.hpp b/cpp/include/cudf/io/parquet.hpp index ac7f378a25d..788ff15f3c1 100644 --- a/cpp/include/cudf/io/parquet.hpp +++ b/cpp/include/cudf/io/parquet.hpp @@ -528,6 +528,8 @@ class parquet_writer_options { std::optional _max_page_fragment_size; // Optional compression statistics std::shared_ptr _compression_stats; + // write V2 page headers? + bool _v2_page_headers = false; /** * @brief Constructor from sink and table. @@ -712,6 +714,13 @@ class parquet_writer_options { return _compression_stats; } + /** + * @brief Returns `true` if V2 page headers should be written. + * + * @return `true` if V2 page headers should be written. + */ + [[nodiscard]] auto is_enabled_write_v2_headers() const { return _v2_page_headers; } + /** * @brief Sets partitions. * @@ -829,6 +838,13 @@ class parquet_writer_options { { _compression_stats = std::move(comp_stats); } + + /** + * @brief Sets preference for V2 page headers. Write V2 page headers if set to `true`. + * + * @param val Boolean value to enable/disable writing of V2 page headers. + */ + void enable_write_v2_headers(bool val) { _v2_page_headers = val; } }; /** @@ -1060,6 +1076,14 @@ class parquet_writer_options_builder { return *this; } + /** + * @brief Set to true if V2 page headers are to be written. + * + * @param enabled Boolean value to enable/disable writing of V2 page headers. 
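// [Editorial aside — not part of the patch] A minimal sketch of how the new
// write_v2_headers option added here is meant to be used, mirroring the
// parquet_test.cpp changes later in this patch; the output path and `tbl`
// are placeholders.
#include <cudf/io/parquet.hpp>
#include <cudf/table/table_view.hpp>

void write_table_with_v2_headers(cudf::table_view const& tbl)
{
  cudf::io::parquet_writer_options const opts =
    cudf::io::parquet_writer_options::builder(cudf::io::sink_info{"out.parquet"}, tbl)
      .write_v2_headers(true)  // emit DATA_PAGE_V2 page headers
      .compression(cudf::io::compression_type::SNAPPY);
  cudf::io::write_parquet(opts);
}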
+ * @return this for chaining + */ + parquet_writer_options_builder& write_v2_headers(bool enabled); + /** * @brief move parquet_writer_options member once it's built. */ @@ -1141,6 +1165,8 @@ class chunked_parquet_writer_options { std::optional _max_page_fragment_size; // Optional compression statistics std::shared_ptr _compression_stats; + // write V2 page headers? + bool _v2_page_headers = false; /** * @brief Constructor from sink. @@ -1281,6 +1307,13 @@ class chunked_parquet_writer_options { return _compression_stats; } + /** + * @brief Returns `true` if V2 page headers should be written. + * + * @return `true` if V2 page headers should be written. + */ + [[nodiscard]] auto is_enabled_write_v2_headers() const { return _v2_page_headers; } + /** * @brief Sets metadata. * @@ -1384,6 +1417,13 @@ class chunked_parquet_writer_options { _compression_stats = std::move(comp_stats); } + /** + * @brief Sets preference for V2 page headers. Write V2 page headers if set to `true`. + * + * @param val Boolean value to enable/disable writing of V2 page headers. + */ + void enable_write_v2_headers(bool val) { _v2_page_headers = val; } + /** * @brief creates builder to build chunked_parquet_writer_options. * @@ -1475,6 +1515,14 @@ class chunked_parquet_writer_options_builder { return *this; } + /** + * @brief Set to true if V2 page headers are to be written. + * + * @param enabled Boolean value to enable/disable writing of V2 page headers. + * @return this for chaining + */ + chunked_parquet_writer_options_builder& write_v2_headers(bool enabled); + /** * @brief Sets the maximum row group size, in bytes. * diff --git a/cpp/include/cudf_test/base_fixture.hpp b/cpp/include/cudf_test/base_fixture.hpp index 364355438fd..05319e03003 100644 --- a/cpp/include/cudf_test/base_fixture.hpp +++ b/cpp/include/cudf_test/base_fixture.hpp @@ -59,6 +59,27 @@ class BaseFixture : public ::testing::Test { rmm::mr::device_memory_resource* mr() { return _mr; } }; +/** + * @brief Base test fixture that takes a parameter. 
+ * + * Example: + * ``` + * class MyIntTestFixture : public cudf::test::BaseFixtureWithParam {}; + * ``` + */ +template +class BaseFixtureWithParam : public ::testing::TestWithParam { + rmm::mr::device_memory_resource* _mr{rmm::mr::get_current_device_resource()}; + + public: + /** + * @brief Returns pointer to `device_memory_resource` that should be used for + * all tests inheriting from this fixture + * @return pointer to memory resource + */ + rmm::mr::device_memory_resource* mr() const { return _mr; } +}; + template struct uniform_distribution_impl {}; template diff --git a/cpp/src/io/functions.cpp b/cpp/src/io/functions.cpp index f0df650c79e..5adb2046dbd 100644 --- a/cpp/src/io/functions.cpp +++ b/cpp/src/io/functions.cpp @@ -750,6 +750,12 @@ parquet_writer_options_builder& parquet_writer_options_builder::max_page_fragmen return *this; } +parquet_writer_options_builder& parquet_writer_options_builder::write_v2_headers(bool enabled) +{ + options.enable_write_v2_headers(enabled); + return *this; +} + void chunked_parquet_writer_options::set_key_value_metadata( std::vector> metadata) { @@ -831,6 +837,13 @@ chunked_parquet_writer_options_builder& chunked_parquet_writer_options_builder:: return *this; } +chunked_parquet_writer_options_builder& chunked_parquet_writer_options_builder::write_v2_headers( + bool enabled) +{ + options.enable_write_v2_headers(enabled); + return *this; +} + chunked_parquet_writer_options_builder& chunked_parquet_writer_options_builder::max_page_fragment_size(size_type val) { diff --git a/cpp/src/io/parquet/compact_protocol_reader.cpp b/cpp/src/io/parquet/compact_protocol_reader.cpp index 789c76a860c..92fcd151925 100644 --- a/cpp/src/io/parquet/compact_protocol_reader.cpp +++ b/cpp/src/io/parquet/compact_protocol_reader.cpp @@ -255,7 +255,8 @@ bool CompactProtocolReader::read(PageHeader* p) ParquetFieldInt32(2, p->uncompressed_page_size), ParquetFieldInt32(3, p->compressed_page_size), ParquetFieldStruct(5, p->data_page_header), - ParquetFieldStruct(7, p->dictionary_page_header)); + ParquetFieldStruct(7, p->dictionary_page_header), + ParquetFieldStruct(8, p->data_page_header_v2)); return function_builder(this, op); } @@ -275,6 +276,18 @@ bool CompactProtocolReader::read(DictionaryPageHeader* d) return function_builder(this, op); } +bool CompactProtocolReader::read(DataPageHeaderV2* d) +{ + auto op = std::make_tuple(ParquetFieldInt32(1, d->num_values), + ParquetFieldInt32(2, d->num_nulls), + ParquetFieldInt32(3, d->num_rows), + ParquetFieldEnum(4, d->encoding), + ParquetFieldInt32(5, d->definition_levels_byte_length), + ParquetFieldInt32(6, d->repetition_levels_byte_length), + ParquetFieldBool(7, d->is_compressed)); + return function_builder(this, op); +} + bool CompactProtocolReader::read(KeyValue* k) { auto op = std::make_tuple(ParquetFieldString(1, k->key), ParquetFieldString(2, k->value)); diff --git a/cpp/src/io/parquet/compact_protocol_reader.hpp b/cpp/src/io/parquet/compact_protocol_reader.hpp index 453a4be9b83..62ccacaac37 100644 --- a/cpp/src/io/parquet/compact_protocol_reader.hpp +++ b/cpp/src/io/parquet/compact_protocol_reader.hpp @@ -114,6 +114,7 @@ class CompactProtocolReader { bool read(PageHeader* p); bool read(DataPageHeader* d); bool read(DictionaryPageHeader* d); + bool read(DataPageHeaderV2* d); bool read(KeyValue* k); bool read(PageLocation* p); bool read(OffsetIndex* o); diff --git a/cpp/src/io/parquet/page_enc.cu b/cpp/src/io/parquet/page_enc.cu index 190f70d0747..9f4c0ba943a 100644 --- a/cpp/src/io/parquet/page_enc.cu +++ 
b/cpp/src/io/parquet/page_enc.cu @@ -309,7 +309,8 @@ __global__ void __launch_bounds__(128) int32_t num_columns, size_t max_page_size_bytes, size_type max_page_size_rows, - uint32_t page_align) + uint32_t page_align, + bool write_v2_headers) { // TODO: All writing seems to be done by thread 0. Could be replaced by thrust foreach __shared__ __align__(8) parquet_column_device_view col_g; @@ -318,7 +319,8 @@ __global__ void __launch_bounds__(128) __shared__ __align__(8) EncPage page_g; __shared__ __align__(8) statistics_merge_group pagestats_g; - uint32_t t = threadIdx.x; + uint32_t const t = threadIdx.x; + auto const data_page_type = write_v2_headers ? PageType::DATA_PAGE_V2 : PageType::DATA_PAGE; if (t == 0) { col_g = col_desc[blockIdx.x]; @@ -449,7 +451,7 @@ __global__ void __launch_bounds__(128) page_g.num_fragments = fragments_in_chunk - page_start; page_g.chunk = &chunks[blockIdx.y][blockIdx.x]; page_g.chunk_id = blockIdx.y * num_columns + blockIdx.x; - page_g.page_type = PageType::DATA_PAGE; + page_g.page_type = data_page_type; page_g.hdr_size = 0; page_g.max_hdr_size = 32; // Max size excluding statistics if (ck_g.stats) { @@ -968,8 +970,12 @@ __global__ void __launch_bounds__(128, 8) device_span comp_results) { __shared__ __align__(8) page_enc_state_s state_g; - using block_scan = cub::BlockScan; - __shared__ typename block_scan::TempStorage temp_storage; + using block_reduce = cub::BlockReduce; + using block_scan = cub::BlockScan; + __shared__ union { + typename block_reduce::TempStorage reduce_storage; + typename block_scan::TempStorage scan_storage; + } temp_storage; page_enc_state_s* const s = &state_g; uint32_t t = threadIdx.x; @@ -980,9 +986,15 @@ __global__ void __launch_bounds__(128, 8) s->ck = *s->page.chunk; s->col = *s->ck.col_desc; s->cur = s->page.page_data + s->page.max_hdr_size; + // init V2 info + s->page.def_lvl_bytes = 0; + s->page.rep_lvl_bytes = 0; + s->page.num_nulls = 0; } __syncthreads(); + auto const is_v2 = s->page.page_type == PageType::DATA_PAGE_V2; + // Encode Repetition and Definition levels if (s->page.page_type != PageType::DICTIONARY_PAGE && (s->col.num_def_level_bits()) != 0 && // This means max definition level is not 0 (nullable) @@ -995,7 +1007,10 @@ __global__ void __launch_bounds__(128, 8) s->rle_run = 0; s->rle_pos = 0; s->rle_numvals = 0; - s->rle_out = s->cur + 4; + s->rle_out = s->cur; + if (not is_v2) { + s->rle_out += 4; // save space for length + } } __syncthreads(); while (s->rle_numvals < s->page.num_rows) { @@ -1037,11 +1052,13 @@ __global__ void __launch_bounds__(128, 8) __syncthreads(); } if (t < 32) { - uint8_t* cur = s->cur; - uint8_t* rle_out = s->rle_out; - if (t < 4) { - uint32_t rle_bytes = (uint32_t)(rle_out - cur) - 4; - cur[t] = rle_bytes >> (t * 8); + uint8_t* const cur = s->cur; + uint8_t* const rle_out = s->rle_out; + uint32_t const rle_bytes = static_cast(rle_out - cur) - (is_v2 ? 0 : 4); + if (is_v2 && t == 0) { + s->page.def_lvl_bytes = rle_bytes; + } else if (not is_v2 && t < 4) { + cur[t] = rle_bytes >> (t * 8); } __syncwarp(); if (t == 0) { s->cur = rle_out; } @@ -1050,14 +1067,17 @@ __global__ void __launch_bounds__(128, 8) } else if (s->page.page_type != PageType::DICTIONARY_PAGE && s->col.num_rep_level_bits() != 0 // This means there ARE repetition levels (has list) ) { - auto encode_levels = [&](uint8_t const* lvl_val_data, uint32_t nbits) { + auto encode_levels = [&](uint8_t const* lvl_val_data, uint32_t nbits, uint32_t& lvl_bytes) { // For list types, the repetition and definition levels are pre-calculated. 
We just need to // encode and write them now. if (!t) { s->rle_run = 0; s->rle_pos = 0; s->rle_numvals = 0; - s->rle_out = s->cur + 4; + s->rle_out = s->cur; + if (not is_v2) { + s->rle_out += 4; // save space for length + } } __syncthreads(); size_type page_first_val_idx = s->col.level_offsets[s->page.start_row]; @@ -1075,19 +1095,21 @@ __global__ void __launch_bounds__(128, 8) __syncthreads(); } if (t < 32) { - uint8_t* cur = s->cur; - uint8_t* rle_out = s->rle_out; - if (t < 4) { - uint32_t rle_bytes = (uint32_t)(rle_out - cur) - 4; - cur[t] = rle_bytes >> (t * 8); + uint8_t* const cur = s->cur; + uint8_t* const rle_out = s->rle_out; + uint32_t const rle_bytes = static_cast(rle_out - cur) - (is_v2 ? 0 : 4); + if (is_v2 && t == 0) { + lvl_bytes = rle_bytes; + } else if (not is_v2 && t < 4) { + cur[t] = rle_bytes >> (t * 8); } __syncwarp(); if (t == 0) { s->cur = rle_out; } } }; - encode_levels(s->col.rep_values, s->col.num_rep_level_bits()); + encode_levels(s->col.rep_values, s->col.num_rep_level_bits(), s->page.rep_lvl_bytes); __syncthreads(); - encode_levels(s->col.def_values, s->col.num_def_level_bits()); + encode_levels(s->col.def_values, s->col.num_def_level_bits(), s->page.def_lvl_bytes); } // Encode data values __syncthreads(); @@ -1118,6 +1140,7 @@ __global__ void __launch_bounds__(128, 8) s->chunk_start_val = row_to_value_idx(s->ck.start_row, s->col); } __syncthreads(); + uint32_t num_valid = 0; for (uint32_t cur_val_idx = 0; cur_val_idx < s->page.num_leaf_values;) { uint32_t nvals = min(s->page.num_leaf_values - cur_val_idx, 128); uint32_t len, pos; @@ -1144,13 +1167,15 @@ __global__ void __launch_bounds__(128, 8) return std::make_tuple(is_valid, val_idx); }(); + if (is_valid) num_valid++; + cur_val_idx += nvals; if (dict_bits >= 0) { // Dictionary encoding if (dict_bits > 0) { uint32_t rle_numvals; uint32_t rle_numvals_in_block; - block_scan(temp_storage).ExclusiveSum(is_valid, pos, rle_numvals_in_block); + block_scan(temp_storage.scan_storage).ExclusiveSum(is_valid, pos, rle_numvals_in_block); rle_numvals = s->rle_numvals; if (is_valid) { uint32_t v; @@ -1190,7 +1215,7 @@ __global__ void __launch_bounds__(128, 8) len = 0; } uint32_t total_len = 0; - block_scan(temp_storage).ExclusiveSum(len, pos, total_len); + block_scan(temp_storage.scan_storage).ExclusiveSum(len, pos, total_len); __syncthreads(); if (t == 0) { s->cur = dst + total_len; } if (is_valid) { @@ -1317,7 +1342,11 @@ __global__ void __launch_bounds__(128, 8) __syncthreads(); } } + + uint32_t const valid_count = block_reduce(temp_storage.reduce_storage).Sum(num_valid); + if (t == 0) { + s->page.num_nulls = s->page.num_values - valid_count; uint8_t* base = s->page.page_data + s->page.max_hdr_size; auto actual_data_size = static_cast(s->cur - base); if (actual_data_size > s->page.max_data_size) { @@ -1325,8 +1354,13 @@ __global__ void __launch_bounds__(128, 8) } s->page.max_data_size = actual_data_size; if (not comp_in.empty()) { - comp_in[blockIdx.x] = {base, actual_data_size}; - comp_out[blockIdx.x] = {s->page.compressed_data + s->page.max_hdr_size, 0}; // size is unused + // V2 does not compress rep and def level data + size_t const skip_comp_size = s->page.def_lvl_bytes + s->page.rep_lvl_bytes; + comp_in[blockIdx.x] = {base + skip_comp_size, actual_data_size - skip_comp_size}; + comp_out[blockIdx.x] = {s->page.compressed_data + s->page.max_hdr_size + skip_comp_size, + 0}; // size is unused + // copy uncompressed bytes over + memcpy(s->page.compressed_data + s->page.max_hdr_size, base, skip_comp_size); } 
pages[blockIdx.x] = s->page; if (not comp_results.empty()) { @@ -1367,9 +1401,10 @@ __global__ void __launch_bounds__(decide_compression_block_size) for (auto page_id = lane_id; page_id < num_pages; page_id += cudf::detail::warp_size) { auto const& curr_page = ck_g[warp_id].pages[page_id]; auto const page_data_size = curr_page.max_data_size; + auto const lvl_bytes = curr_page.def_lvl_bytes + curr_page.rep_lvl_bytes; uncompressed_data_size += page_data_size; if (auto comp_res = curr_page.comp_res; comp_res != nullptr) { - compressed_data_size += comp_res->bytes_written; + compressed_data_size += comp_res->bytes_written + lvl_bytes; if (comp_res->status != compression_status::SUCCESS) { atomicOr(&compression_error[warp_id], 1); } @@ -1493,6 +1528,13 @@ class header_encoder { current_header_ptr = cpw_put_int64(current_header_ptr, static_cast(value)); } + inline __device__ void field_bool(int field, bool value) + { + current_header_ptr = cpw_put_fldh( + current_header_ptr, field, current_field_index, value ? ST_FLD_TRUE : ST_FLD_FALSE); + current_field_index = field; + } + template inline __device__ void field_int32(int field, T value) { @@ -1812,8 +1854,10 @@ __global__ void __launch_bounds__(128) } uncompressed_page_size = page_g.max_data_size; if (ck_g.is_compressed) { + auto const lvl_bytes = page_g.def_lvl_bytes + page_g.rep_lvl_bytes; hdr_start = page_g.compressed_data; - compressed_page_size = (uint32_t)comp_results[blockIdx.x].bytes_written; + compressed_page_size = + static_cast(comp_results[blockIdx.x].bytes_written) + lvl_bytes; page_g.max_data_size = compressed_page_size; } else { hdr_start = page_g.page_data; @@ -1853,6 +1897,23 @@ __global__ void __launch_bounds__(128) encoder.field_struct_end(5); } encoder.field_struct_end(5); + } else if (page_type == PageType::DATA_PAGE_V2) { + encoder.field_struct_begin(8); + encoder.field_int32(1, page_g.num_values); + encoder.field_int32(2, page_g.num_nulls); + encoder.field_int32(3, page_g.num_rows); + encoder.field_int32(4, encoding); + encoder.field_int32(5, page_g.def_lvl_bytes); + encoder.field_int32(6, page_g.rep_lvl_bytes); + encoder.field_bool(7, ck_g.is_compressed); // TODO can compress at page level now + // Optionally encode page-level statistics + if (not page_stats.empty()) { + encoder.field_struct_begin(8); + encoder.set_ptr( + EncodeStatistics(encoder.get_ptr(), &page_stats[blockIdx.x], col_g.stats_dtype, scratch)); + encoder.field_struct_end(8); + } + encoder.field_struct_end(8); } else { // DictionaryPageHeader encoder.field_struct_begin(7); @@ -2154,6 +2215,7 @@ void InitEncoderPages(device_2dspan chunks, size_t max_page_size_bytes, size_type max_page_size_rows, uint32_t page_align, + bool write_v2_headers, statistics_merge_group* page_grstats, statistics_merge_group* chunk_grstats, rmm::cuda_stream_view stream) @@ -2170,7 +2232,8 @@ void InitEncoderPages(device_2dspan chunks, num_columns, max_page_size_bytes, max_page_size_rows, - page_align); + page_align, + write_v2_headers); } void EncodePages(device_span pages, diff --git a/cpp/src/io/parquet/parquet.hpp b/cpp/src/io/parquet/parquet.hpp index a25c7fab712..a729f28d672 100644 --- a/cpp/src/io/parquet/parquet.hpp +++ b/cpp/src/io/parquet/parquet.hpp @@ -298,6 +298,20 @@ struct DataPageHeader { Encoding repetition_level_encoding = Encoding::PLAIN; // Encoding used for repetition levels }; +/** + * @brief Thrift-derived struct describing the header for a V2 data page + */ +struct DataPageHeaderV2 { + int32_t num_values = 0; // Number of values, including NULLs, in this 
data page. + int32_t num_nulls = 0; // Number of NULL values, in this data page. + int32_t num_rows = 0; // Number of rows in this data page. which means + // pages change on record boundaries (r = 0) + Encoding encoding = Encoding::PLAIN; // Encoding used for this data page + int32_t definition_levels_byte_length = 0; // length of the definition levels + int32_t repetition_levels_byte_length = 0; // length of the repetition levels + bool is_compressed = true; // whether the values are compressed. +}; + /** * @brief Thrift-derived struct describing the header for a dictionary page */ @@ -322,6 +336,7 @@ struct PageHeader { int32_t compressed_page_size = 0; // Compressed page size in bytes (not including the header) DataPageHeader data_page_header; DictionaryPageHeader dictionary_page_header; + DataPageHeaderV2 data_page_header_v2; }; /** diff --git a/cpp/src/io/parquet/parquet_gpu.hpp b/cpp/src/io/parquet/parquet_gpu.hpp index 97c71de9a9b..51d2f952a33 100644 --- a/cpp/src/io/parquet/parquet_gpu.hpp +++ b/cpp/src/io/parquet/parquet_gpu.hpp @@ -422,7 +422,10 @@ struct EncPage { uint32_t num_leaf_values; //!< Values in page. Different from num_rows in case of nested types uint32_t num_values; //!< Number of def/rep level values in page. Includes null/empty elements in //!< non-leaf levels + uint32_t def_lvl_bytes; //!< Number of bytes of encoded definition level data (V2 only) + uint32_t rep_lvl_bytes; //!< Number of bytes of encoded repetition level data (V2 only) compression_result* comp_res; //!< Ptr to compression result + uint32_t num_nulls; //!< Number of null values (V2 only) (down here for alignment) }; /** @@ -648,6 +651,7 @@ void get_dictionary_indices(cudf::detail::device_2dspan * @param[in] num_columns Number of columns * @param[in] page_grstats Setup for page-level stats * @param[in] page_align Required alignment for uncompressed pages + * @param[in] write_v2_headers True if V2 page headers should be written * @param[in] chunk_grstats Setup for chunk-level stats * @param[in] max_page_comp_data_size Calculated maximum compressed data size of pages * @param[in] stream CUDA stream to use, default 0 @@ -661,6 +665,7 @@ void InitEncoderPages(cudf::detail::device_2dspan chunks, size_t max_page_size_bytes, size_type max_page_size_rows, uint32_t page_align, + bool write_v2_headers, statistics_merge_group* page_grstats, statistics_merge_group* chunk_grstats, rmm::cuda_stream_view stream); diff --git a/cpp/src/io/parquet/writer_impl.cu b/cpp/src/io/parquet/writer_impl.cu index 17a0a903a47..06e7b6bfc8a 100644 --- a/cpp/src/io/parquet/writer_impl.cu +++ b/cpp/src/io/parquet/writer_impl.cu @@ -996,6 +996,7 @@ auto init_page_sizes(hostdevice_2dvector& chunks, uint32_t num_columns, size_t max_page_size_bytes, size_type max_page_size_rows, + bool write_v2_headers, Compression compression_codec, rmm::cuda_stream_view stream) { @@ -1012,6 +1013,7 @@ auto init_page_sizes(hostdevice_2dvector& chunks, max_page_size_bytes, max_page_size_rows, page_alignment(compression_codec), + write_v2_headers, nullptr, nullptr, stream); @@ -1036,6 +1038,7 @@ auto init_page_sizes(hostdevice_2dvector& chunks, max_page_size_bytes, max_page_size_rows, page_alignment(compression_codec), + write_v2_headers, nullptr, nullptr, stream); @@ -1061,6 +1064,7 @@ auto init_page_sizes(hostdevice_2dvector& chunks, max_page_size_bytes, max_page_size_rows, page_alignment(compression_codec), + write_v2_headers, nullptr, nullptr, stream); @@ -1197,6 +1201,7 @@ build_chunk_dictionaries(hostdevice_2dvector& chunks, * @param compression 
Compression format * @param max_page_size_bytes Maximum uncompressed page size, in bytes * @param max_page_size_rows Maximum page size, in rows + * @param write_v2_headers True if version 2 page headers are to be written * @param stream CUDA stream used for device memory operations and kernel launches */ void init_encoder_pages(hostdevice_2dvector& chunks, @@ -1211,6 +1216,7 @@ void init_encoder_pages(hostdevice_2dvector& chunks, Compression compression, size_t max_page_size_bytes, size_type max_page_size_rows, + bool write_v2_headers, rmm::cuda_stream_view stream) { rmm::device_uvector page_stats_mrg(num_stats_bfr, stream); @@ -1224,6 +1230,7 @@ void init_encoder_pages(hostdevice_2dvector& chunks, max_page_size_bytes, max_page_size_rows, page_alignment(compression), + write_v2_headers, (num_stats_bfr) ? page_stats_mrg.data() : nullptr, (num_stats_bfr > num_pages) ? page_stats_mrg.data() + num_pages : nullptr, stream); @@ -1424,6 +1431,7 @@ void fill_table_meta(std::unique_ptr const& table_meta) * @param max_dictionary_size Maximum dictionary size, in bytes * @param single_write_mode Flag to indicate that we are guaranteeing a single table write * @param int96_timestamps Flag to indicate if timestamps will be written as INT96 + * @param write_v2_headers True if V2 page headers are to be written * @param out_sink Sink for checking if device write is supported, should not be used to write any * data in this function * @param stream CUDA stream used for device memory operations and kernel launches @@ -1447,6 +1455,7 @@ auto convert_table_to_parquet_data(table_input_metadata& table_meta, size_t max_dictionary_size, single_write_mode write_mode, bool int96_timestamps, + bool write_v2_headers, host_span const> out_sink, rmm::cuda_stream_view stream) { @@ -1764,8 +1773,14 @@ auto convert_table_to_parquet_data(table_input_metadata& table_meta, } // Build chunk dictionaries and count pages. Sends chunks to device. 
- cudf::detail::hostdevice_vector comp_page_sizes = init_page_sizes( - chunks, col_desc, num_columns, max_page_size_bytes, max_page_size_rows, compression, stream); + cudf::detail::hostdevice_vector comp_page_sizes = init_page_sizes(chunks, + col_desc, + num_columns, + max_page_size_bytes, + max_page_size_rows, + write_v2_headers, + compression, + stream); // Find which partition a rg belongs to std::vector rg_to_part; @@ -1878,6 +1893,7 @@ auto convert_table_to_parquet_data(table_input_metadata& table_meta, compression, max_page_size_bytes, max_page_size_rows, + write_v2_headers, stream); } @@ -1982,6 +1998,7 @@ writer::impl::impl(std::vector> sinks, _max_dictionary_size(options.get_max_dictionary_size()), _max_page_fragment_size(options.get_max_page_fragment_size()), _int96_timestamps(options.is_enabled_int96_timestamps()), + _write_v2_headers(options.is_enabled_write_v2_headers()), _column_index_truncate_length(options.get_column_index_truncate_length()), _kv_meta(options.get_key_value_metadata()), _single_write_mode(mode), @@ -2009,6 +2026,7 @@ writer::impl::impl(std::vector> sinks, _max_dictionary_size(options.get_max_dictionary_size()), _max_page_fragment_size(options.get_max_page_fragment_size()), _int96_timestamps(options.is_enabled_int96_timestamps()), + _write_v2_headers(options.is_enabled_write_v2_headers()), _column_index_truncate_length(options.get_column_index_truncate_length()), _kv_meta(options.get_key_value_metadata()), _single_write_mode(mode), @@ -2085,6 +2103,7 @@ void writer::impl::write(table_view const& input, std::vector co _max_dictionary_size, _single_write_mode, _int96_timestamps, + _write_v2_headers, _out_sink, _stream); } catch (...) { // catch any exception type @@ -2199,7 +2218,7 @@ void writer::impl::write_parquet_data_to_sink( auto const& enc_page = h_pages[curr_page_idx++]; // skip dict pages - if (enc_page.page_type != PageType::DATA_PAGE) { continue; } + if (enc_page.page_type == PageType::DICTIONARY_PAGE) { continue; } int32_t this_page_size = enc_page.hdr_size + enc_page.max_data_size; // first_row_idx is relative to start of row group diff --git a/cpp/src/io/parquet/writer_impl.hpp b/cpp/src/io/parquet/writer_impl.hpp index a6c55e04b96..89ef85ba2bd 100644 --- a/cpp/src/io/parquet/writer_impl.hpp +++ b/cpp/src/io/parquet/writer_impl.hpp @@ -161,6 +161,7 @@ class writer::impl { size_t const _max_dictionary_size; std::optional const _max_page_fragment_size; bool const _int96_timestamps; + bool const _write_v2_headers; int32_t const _column_index_truncate_length; std::vector> const _kv_meta; // Optional user metadata. 
single_write_mode const _single_write_mode; // Special parameter only used by `write()` to diff --git a/cpp/tests/io/parquet_test.cpp b/cpp/tests/io/parquet_test.cpp index a86190239fe..8c7d598d33f 100644 --- a/cpp/tests/io/parquet_test.cpp +++ b/cpp/tests/io/parquet_test.cpp @@ -376,7 +376,7 @@ struct ParquetChunkedWriterNumericTypeTest : public ParquetChunkedWriterTest { TYPED_TEST_SUITE(ParquetChunkedWriterNumericTypeTest, SupportedTypes); // Base test fixture for size-parameterized tests -class ParquetSizedTest : public ::testing::TestWithParam {}; +class ParquetSizedTest : public ::cudf::test::BaseFixtureWithParam {}; // test the allowed bit widths for dictionary encoding // values chosen to trigger 1, 2, 3, 4, 5, 6, 8, 10, 12, 16, 20, and 24 bit dictionaries @@ -385,6 +385,13 @@ INSTANTIATE_TEST_SUITE_P(ParquetDictionaryTest, testing::Range(1, 25), testing::PrintToStringParamName()); +// Base test fixture for V2 header tests +class ParquetV2Test : public ::cudf::test::BaseFixtureWithParam {}; +INSTANTIATE_TEST_SUITE_P(ParquetV2ReadWriteTest, + ParquetV2Test, + testing::Bool(), + testing::PrintToStringParamName()); + namespace { // Generates a vector of uniform random values of type T template @@ -594,9 +601,10 @@ TYPED_TEST(ParquetWriterTimestampTypeTest, TimestampOverflow) CUDF_TEST_EXPECT_TABLES_EQUAL(expected, result.tbl->view()); } -TEST_F(ParquetWriterTest, MultiColumn) +TEST_P(ParquetV2Test, MultiColumn) { constexpr auto num_rows = 100000; + auto const is_v2 = GetParam(); // auto col0_data = random_values(num_rows); auto col1_data = random_values(num_rows); @@ -645,6 +653,7 @@ TEST_F(ParquetWriterTest, MultiColumn) auto filepath = temp_env->get_temp_filepath("MultiColumn.parquet"); cudf::io::parquet_writer_options out_opts = cudf::io::parquet_writer_options::builder(cudf::io::sink_info{filepath}, expected) + .write_v2_headers(is_v2) .metadata(expected_metadata); cudf::io::write_parquet(out_opts); @@ -656,9 +665,10 @@ TEST_F(ParquetWriterTest, MultiColumn) cudf::test::expect_metadata_equal(expected_metadata, result.metadata); } -TEST_F(ParquetWriterTest, MultiColumnWithNulls) +TEST_P(ParquetV2Test, MultiColumnWithNulls) { constexpr auto num_rows = 100; + auto const is_v2 = GetParam(); // auto col0_data = random_values(num_rows); auto col1_data = random_values(num_rows); @@ -715,6 +725,7 @@ TEST_F(ParquetWriterTest, MultiColumnWithNulls) auto filepath = temp_env->get_temp_filepath("MultiColumnWithNulls.parquet"); cudf::io::parquet_writer_options out_opts = cudf::io::parquet_writer_options::builder(cudf::io::sink_info{filepath}, expected) + .write_v2_headers(is_v2) .metadata(expected_metadata); cudf::io::write_parquet(out_opts); @@ -730,8 +741,10 @@ TEST_F(ParquetWriterTest, MultiColumnWithNulls) cudf::test::expect_metadata_equal(expected_metadata, result.metadata); } -TEST_F(ParquetWriterTest, Strings) +TEST_P(ParquetV2Test, Strings) { + auto const is_v2 = GetParam(); + std::vector strings{ "Monday", "Wȅdnȅsday", "Friday", "Monday", "Friday", "Friday", "Friday", "Funday"}; auto const num_rows = strings.size(); @@ -754,6 +767,7 @@ TEST_F(ParquetWriterTest, Strings) auto filepath = temp_env->get_temp_filepath("Strings.parquet"); cudf::io::parquet_writer_options out_opts = cudf::io::parquet_writer_options::builder(cudf::io::sink_info{filepath}, expected) + .write_v2_headers(is_v2) .metadata(expected_metadata); cudf::io::write_parquet(out_opts); @@ -823,13 +837,14 @@ TEST_F(ParquetWriterTest, StringsAsBinary) cudf::test::expect_metadata_equal(expected_metadata, result.metadata); } 
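// [Editorial aside — not part of the patch] The pattern the converted tests in
// this file follow: ParquetV2Test is instantiated with testing::Bool(), so each
// TEST_P body runs once with V1 and once with V2 page headers. The test name
// `HypotheticalRoundTrip` is illustrative only and not part of the patch.
TEST_P(ParquetV2Test, HypotheticalRoundTrip)
{
  auto const is_v2 = GetParam();
  auto tbl         = create_random_fixed_table<int>(4, 4, false);
  auto filepath    = temp_env->get_temp_filepath("HypotheticalRoundTrip.parquet");
  cudf::io::parquet_writer_options const out_opts =
    cudf::io::parquet_writer_options::builder(cudf::io::sink_info{filepath}, *tbl)
      .write_v2_headers(is_v2);
  cudf::io::write_parquet(out_opts);

  cudf::io::parquet_reader_options in_opts =
    cudf::io::parquet_reader_options::builder(cudf::io::source_info{filepath});
  auto const result = cudf::io::read_parquet(in_opts);
  CUDF_TEST_EXPECT_TABLES_EQUAL(tbl->view(), result.tbl->view());
}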
-TEST_F(ParquetWriterTest, SlicedTable) +TEST_P(ParquetV2Test, SlicedTable) { // This test checks for writing zero copy, offsetted views into existing cudf tables std::vector strings{ "Monday", "Wȅdnȅsday", "Friday", "Monday", "Friday", "Friday", "Friday", "Funday"}; auto const num_rows = strings.size(); + auto const is_v2 = GetParam(); auto seq_col0 = random_values(num_rows); auto seq_col2 = random_values(num_rows); @@ -926,6 +941,7 @@ TEST_F(ParquetWriterTest, SlicedTable) auto filepath = temp_env->get_temp_filepath("SlicedTable.parquet"); cudf::io::parquet_writer_options out_opts = cudf::io::parquet_writer_options::builder(cudf::io::sink_info{filepath}, expected_slice) + .write_v2_headers(is_v2) .metadata(expected_metadata); cudf::io::write_parquet(out_opts); @@ -937,8 +953,10 @@ TEST_F(ParquetWriterTest, SlicedTable) cudf::test::expect_metadata_equal(expected_metadata, result.metadata); } -TEST_F(ParquetWriterTest, ListColumn) +TEST_P(ParquetV2Test, ListColumn) { + auto const is_v2 = GetParam(); + auto valids = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i % 2; }); auto valids2 = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i != 3; }); @@ -1023,6 +1041,7 @@ TEST_F(ParquetWriterTest, ListColumn) auto filepath = temp_env->get_temp_filepath("ListColumn.parquet"); auto out_opts = cudf::io::parquet_writer_options::builder(cudf::io::sink_info{filepath}, expected) + .write_v2_headers(is_v2) .metadata(expected_metadata) .compression(cudf::io::compression_type::NONE); @@ -1198,8 +1217,10 @@ TEST_F(ParquetWriterTest, Struct) cudf::io::read_parquet(read_args); } -TEST_F(ParquetWriterTest, StructOfList) +TEST_P(ParquetV2Test, StructOfList) { + auto const is_v2 = GetParam(); + // Structget_temp_filepath("StructOfList.parquet"); cudf::io::parquet_writer_options args = cudf::io::parquet_writer_options::builder(cudf::io::sink_info{filepath}, expected) + .write_v2_headers(is_v2) .metadata(expected_metadata); cudf::io::write_parquet(args); @@ -1273,8 +1295,10 @@ TEST_F(ParquetWriterTest, StructOfList) cudf::test::expect_metadata_equal(expected_metadata, result.metadata); } -TEST_F(ParquetWriterTest, ListOfStruct) +TEST_P(ParquetV2Test, ListOfStruct) { + auto const is_v2 = GetParam(); + // Listget_temp_filepath("ListOfStruct.parquet"); cudf::io::parquet_writer_options args = cudf::io::parquet_writer_options::builder(cudf::io::sink_info{filepath}, expected) + .write_v2_headers(is_v2) .metadata(expected_metadata); cudf::io::write_parquet(args); @@ -1457,8 +1482,10 @@ TEST_F(ParquetWriterTest, PartitionedWrite) CUDF_TEST_EXPECT_TABLES_EQUAL(expected2, result2.tbl->view()); } -TEST_F(ParquetWriterTest, PartitionedWriteEmptyPartitions) +TEST_P(ParquetV2Test, PartitionedWriteEmptyPartitions) { + auto const is_v2 = GetParam(); + auto source = create_random_fixed_table(4, 4, false); auto filepath1 = temp_env->get_temp_filepath("PartitionedWrite1.parquet"); @@ -1476,6 +1503,7 @@ TEST_F(ParquetWriterTest, PartitionedWriteEmptyPartitions) cudf::io::parquet_writer_options::builder( cudf::io::sink_info(std::vector{filepath1, filepath2}), *source) .partitions({partition1, partition2}) + .write_v2_headers(is_v2) .compression(cudf::io::compression_type::NONE); cudf::io::write_parquet(args); @@ -1488,8 +1516,10 @@ TEST_F(ParquetWriterTest, PartitionedWriteEmptyPartitions) CUDF_TEST_EXPECT_TABLES_EQUAL(expected2, result2.tbl->view()); } -TEST_F(ParquetWriterTest, PartitionedWriteEmptyColumns) +TEST_P(ParquetV2Test, PartitionedWriteEmptyColumns) { + auto const is_v2 = 
GetParam(); + auto source = create_random_fixed_table(0, 4, false); auto filepath1 = temp_env->get_temp_filepath("PartitionedWrite1.parquet"); @@ -1507,6 +1537,7 @@ TEST_F(ParquetWriterTest, PartitionedWriteEmptyColumns) cudf::io::parquet_writer_options::builder( cudf::io::sink_info(std::vector{filepath1, filepath2}), *source) .partitions({partition1, partition2}) + .write_v2_headers(is_v2) .compression(cudf::io::compression_type::NONE); cudf::io::write_parquet(args); @@ -4082,7 +4113,7 @@ int32_t compare_binary(std::vector const& v1, return 0; } -TEST_F(ParquetWriterTest, LargeColumnIndex) +TEST_P(ParquetV2Test, LargeColumnIndex) { // create a file large enough to be written in 2 batches (currently 1GB per batch) // pick fragment size that num_rows is divisible by, so we'll get equal sized row groups @@ -4090,6 +4121,7 @@ TEST_F(ParquetWriterTest, LargeColumnIndex) const std::string s2(1000, 'b'); constexpr auto num_rows = 512 * 1024; constexpr auto frag_size = num_rows / 128; + auto const is_v2 = GetParam(); auto col0_elements = cudf::detail::make_counting_transform_iterator( 0, [&](auto i) { return (i < num_rows) ? s1 : s2; }); @@ -4103,6 +4135,7 @@ TEST_F(ParquetWriterTest, LargeColumnIndex) .stats_level(cudf::io::statistics_freq::STATISTICS_COLUMN) .compression(cudf::io::compression_type::NONE) .dictionary_policy(cudf::io::dictionary_policy::NEVER) + .write_v2_headers(is_v2) .max_page_fragment_size(frag_size) .row_group_size_bytes(1024 * 1024 * 1024) .row_group_size_rows(num_rows); @@ -4129,9 +4162,12 @@ TEST_F(ParquetWriterTest, LargeColumnIndex) } } -TEST_F(ParquetWriterTest, CheckColumnOffsetIndex) +TEST_P(ParquetV2Test, CheckColumnOffsetIndex) { constexpr auto num_rows = 100000; + auto const is_v2 = GetParam(); + auto const expected_hdr_type = + is_v2 ? cudf::io::parquet::PageType::DATA_PAGE_V2 : cudf::io::parquet::PageType::DATA_PAGE; // fixed length strings auto str1_elements = cudf::detail::make_counting_transform_iterator(0, [](auto i) { @@ -4169,6 +4205,7 @@ TEST_F(ParquetWriterTest, CheckColumnOffsetIndex) const cudf::io::parquet_writer_options out_opts = cudf::io::parquet_writer_options::builder(cudf::io::sink_info{filepath}, expected) .stats_level(cudf::io::statistics_freq::STATISTICS_COLUMN) + .write_v2_headers(is_v2) .max_page_size_rows(20000); cudf::io::write_parquet(out_opts); @@ -4190,9 +4227,9 @@ TEST_F(ParquetWriterTest, CheckColumnOffsetIndex) for (size_t o = 0; o < oi.page_locations.size(); o++) { auto const& page_loc = oi.page_locations[o]; auto const ph = read_page_header(source, page_loc); - EXPECT_EQ(ph.type, cudf::io::parquet::PageType::DATA_PAGE); + EXPECT_EQ(ph.type, expected_hdr_type); EXPECT_EQ(page_loc.first_row_index, num_vals); - num_vals += ph.data_page_header.num_values; + num_vals += is_v2 ? ph.data_page_header_v2.num_rows : ph.data_page_header.num_values; } // loop over page stats from the column index. check that stats.min <= page.min @@ -4216,9 +4253,12 @@ TEST_F(ParquetWriterTest, CheckColumnOffsetIndex) } } -TEST_F(ParquetWriterTest, CheckColumnOffsetIndexNulls) +TEST_P(ParquetV2Test, CheckColumnOffsetIndexNulls) { constexpr auto num_rows = 100000; + auto const is_v2 = GetParam(); + auto const expected_hdr_type = + is_v2 ? 
cudf::io::parquet::PageType::DATA_PAGE_V2 : cudf::io::parquet::PageType::DATA_PAGE; // fixed length strings auto str1_elements = cudf::detail::make_counting_transform_iterator(0, [](auto i) { @@ -4266,6 +4306,7 @@ TEST_F(ParquetWriterTest, CheckColumnOffsetIndexNulls) const cudf::io::parquet_writer_options out_opts = cudf::io::parquet_writer_options::builder(cudf::io::sink_info{filepath}, expected) .stats_level(cudf::io::statistics_freq::STATISTICS_COLUMN) + .write_v2_headers(is_v2) .max_page_size_rows(20000); cudf::io::write_parquet(out_opts); @@ -4287,9 +4328,9 @@ TEST_F(ParquetWriterTest, CheckColumnOffsetIndexNulls) for (size_t o = 0; o < oi.page_locations.size(); o++) { auto const& page_loc = oi.page_locations[o]; auto const ph = read_page_header(source, page_loc); - EXPECT_EQ(ph.type, cudf::io::parquet::PageType::DATA_PAGE); + EXPECT_EQ(ph.type, expected_hdr_type); EXPECT_EQ(page_loc.first_row_index, num_vals); - num_vals += ph.data_page_header.num_values; + num_vals += is_v2 ? ph.data_page_header_v2.num_rows : ph.data_page_header.num_values; } // loop over page stats from the column index. check that stats.min <= page.min @@ -4319,9 +4360,12 @@ TEST_F(ParquetWriterTest, CheckColumnOffsetIndexNulls) } } -TEST_F(ParquetWriterTest, CheckColumnOffsetIndexNullColumn) +TEST_P(ParquetV2Test, CheckColumnOffsetIndexNullColumn) { constexpr auto num_rows = 100000; + auto const is_v2 = GetParam(); + auto const expected_hdr_type = + is_v2 ? cudf::io::parquet::PageType::DATA_PAGE_V2 : cudf::io::parquet::PageType::DATA_PAGE; // fixed length strings auto str1_elements = cudf::detail::make_counting_transform_iterator(0, [](auto i) { @@ -4354,6 +4398,7 @@ TEST_F(ParquetWriterTest, CheckColumnOffsetIndexNullColumn) const cudf::io::parquet_writer_options out_opts = cudf::io::parquet_writer_options::builder(cudf::io::sink_info{filepath}, expected) .stats_level(cudf::io::statistics_freq::STATISTICS_COLUMN) + .write_v2_headers(is_v2) .max_page_size_rows(20000); cudf::io::write_parquet(out_opts); @@ -4375,9 +4420,9 @@ TEST_F(ParquetWriterTest, CheckColumnOffsetIndexNullColumn) for (size_t o = 0; o < oi.page_locations.size(); o++) { auto const& page_loc = oi.page_locations[o]; auto const ph = read_page_header(source, page_loc); - EXPECT_EQ(ph.type, cudf::io::parquet::PageType::DATA_PAGE); + EXPECT_EQ(ph.type, expected_hdr_type); EXPECT_EQ(page_loc.first_row_index, num_vals); - num_vals += ph.data_page_header.num_values; + num_vals += is_v2 ? ph.data_page_header_v2.num_rows : ph.data_page_header.num_values; } // loop over page stats from the column index. check that stats.min <= page.min @@ -4411,8 +4456,12 @@ TEST_F(ParquetWriterTest, CheckColumnOffsetIndexNullColumn) } } -TEST_F(ParquetWriterTest, CheckColumnOffsetIndexStruct) +TEST_P(ParquetV2Test, CheckColumnOffsetIndexStruct) { + auto const is_v2 = GetParam(); + auto const expected_hdr_type = + is_v2 ? 
cudf::io::parquet::PageType::DATA_PAGE_V2 : cudf::io::parquet::PageType::DATA_PAGE; + auto c0 = testdata::ascending(); auto sc0 = testdata::ascending(); @@ -4441,6 +4490,7 @@ TEST_F(ParquetWriterTest, CheckColumnOffsetIndexStruct) const cudf::io::parquet_writer_options out_opts = cudf::io::parquet_writer_options::builder(cudf::io::sink_info{filepath}, expected) .stats_level(cudf::io::statistics_freq::STATISTICS_COLUMN) + .write_v2_headers(is_v2) .max_page_size_rows(page_size_for_ordered_tests); cudf::io::write_parquet(out_opts); @@ -4466,10 +4516,11 @@ TEST_F(ParquetWriterTest, CheckColumnOffsetIndexStruct) for (size_t o = 0; o < oi.page_locations.size(); o++) { auto const& page_loc = oi.page_locations[o]; auto const ph = read_page_header(source, page_loc); - EXPECT_EQ(ph.type, cudf::io::parquet::PageType::DATA_PAGE); + EXPECT_EQ(ph.type, expected_hdr_type); + EXPECT_EQ(page_loc.first_row_index, num_vals); // last column has 2 values per row - EXPECT_EQ(page_loc.first_row_index * (c == rg.columns.size() - 1 ? 2 : 1), num_vals); - num_vals += ph.data_page_header.num_values; + num_vals += is_v2 ? ph.data_page_header_v2.num_rows + : ph.data_page_header.num_values / (c == rg.columns.size() - 1 ? 2 : 1); } // loop over page stats from the column index. check that stats.min <= page.min @@ -4489,8 +4540,86 @@ TEST_F(ParquetWriterTest, CheckColumnOffsetIndexStruct) } } -TEST_F(ParquetWriterTest, CheckColumnIndexListWithNulls) +TEST_P(ParquetV2Test, CheckColumnOffsetIndexStructNulls) +{ + auto const is_v2 = GetParam(); + auto const expected_hdr_type = + is_v2 ? cudf::io::parquet::PageType::DATA_PAGE_V2 : cudf::io::parquet::PageType::DATA_PAGE; + + auto validity2 = + cudf::detail::make_counting_transform_iterator(0, [](cudf::size_type i) { return i % 2; }); + auto validity3 = cudf::detail::make_counting_transform_iterator( + 0, [](cudf::size_type i) { return (i % 3) != 0; }); + auto validity4 = cudf::detail::make_counting_transform_iterator( + 0, [](cudf::size_type i) { return (i % 4) != 0; }); + auto validity5 = cudf::detail::make_counting_transform_iterator( + 0, [](cudf::size_type i) { return (i % 5) != 0; }); + + auto c0 = testdata::ascending(); + + auto col1_data = random_values(num_ordered_rows); + auto col2_data = random_values(num_ordered_rows); + auto col3_data = random_values(num_ordered_rows); + + // col1 is all nulls + auto col1 = + cudf::test::fixed_width_column_wrapper(col1_data.begin(), col1_data.end(), validity2); + auto col2 = + cudf::test::fixed_width_column_wrapper(col2_data.begin(), col2_data.end(), validity3); + auto col3 = + cudf::test::fixed_width_column_wrapper(col2_data.begin(), col2_data.end(), validity4); + + std::vector> struct_children; + struct_children.push_back(col1.release()); + struct_children.push_back(col2.release()); + struct_children.push_back(col3.release()); + auto struct_validity = std::vector(validity5, validity5 + num_ordered_rows); + cudf::test::structs_column_wrapper c1(std::move(struct_children), struct_validity); + table_view expected({c0, c1}); + + auto const filepath = temp_env->get_temp_filepath("CheckColumnOffsetIndexStructNulls.parquet"); + const cudf::io::parquet_writer_options out_opts = + cudf::io::parquet_writer_options::builder(cudf::io::sink_info{filepath}, expected) + .stats_level(cudf::io::statistics_freq::STATISTICS_COLUMN) + .write_v2_headers(is_v2) + .max_page_size_rows(page_size_for_ordered_tests); + cudf::io::write_parquet(out_opts); + + auto const source = cudf::io::datasource::create(filepath); + cudf::io::parquet::FileMetaData fmd; 
+ + read_footer(source, &fmd); + + for (size_t r = 0; r < fmd.row_groups.size(); r++) { + auto const& rg = fmd.row_groups[r]; + for (size_t c = 0; c < rg.columns.size(); c++) { + auto const& chunk = rg.columns[c]; + + // loop over offsets, read each page header, make sure it's a data page and that + // the first row index is correct + auto const oi = read_offset_index(source, chunk); + auto const ci = read_column_index(source, chunk); + + int64_t num_vals = 0; + for (size_t o = 0; o < oi.page_locations.size(); o++) { + auto const& page_loc = oi.page_locations[o]; + auto const ph = read_page_header(source, page_loc); + EXPECT_EQ(ph.type, expected_hdr_type); + EXPECT_EQ(page_loc.first_row_index, num_vals); + num_vals += is_v2 ? ph.data_page_header_v2.num_rows : ph.data_page_header.num_values; + // check that null counts match + if (is_v2) { EXPECT_EQ(ci.null_counts[o], ph.data_page_header_v2.num_nulls); } + } + } + } +} + +TEST_P(ParquetV2Test, CheckColumnIndexListWithNulls) { + auto const is_v2 = GetParam(); + auto const expected_hdr_type = + is_v2 ? cudf::io::parquet::PageType::DATA_PAGE_V2 : cudf::io::parquet::PageType::DATA_PAGE; + using cudf::test::iterators::null_at; using cudf::test::iterators::nulls_at; using lcw = cudf::test::lists_column_wrapper; @@ -4576,6 +4705,7 @@ TEST_F(ParquetWriterTest, CheckColumnIndexListWithNulls) auto const filepath = temp_env->get_temp_filepath("ColumnIndexListWithNulls.parquet"); auto out_opts = cudf::io::parquet_writer_options::builder(cudf::io::sink_info{filepath}, expected) .stats_level(cudf::io::statistics_freq::STATISTICS_COLUMN) + .write_v2_headers(is_v2) .compression(cudf::io::compression_type::NONE); cudf::io::write_parquet(out_opts); @@ -4594,14 +4724,12 @@ TEST_F(ParquetWriterTest, CheckColumnIndexListWithNulls) // the first row index is correct auto const oi = read_offset_index(source, chunk); - int64_t num_vals = 0; for (size_t o = 0; o < oi.page_locations.size(); o++) { auto const& page_loc = oi.page_locations[o]; auto const ph = read_page_header(source, page_loc); - EXPECT_EQ(ph.type, cudf::io::parquet::PageType::DATA_PAGE); - // last column has 2 values per row - EXPECT_EQ(page_loc.first_row_index * (c == rg.columns.size() - 1 ? 2 : 1), num_vals); - num_vals += ph.data_page_header.num_values; + EXPECT_EQ(ph.type, expected_hdr_type); + // check null counts in V2 header + if (is_v2) { EXPECT_EQ(ph.data_page_header_v2.num_nulls, expected_null_counts[c]); } } // check null counts in column chunk stats and page indexes From 20c3aaba97d69bccd0e0bac90d57ec722437bd9b Mon Sep 17 00:00:00 2001 From: Ed Seidl Date: Tue, 15 Aug 2023 14:43:40 -0700 Subject: [PATCH 058/230] Refactor Parquet reader handling of V2 page header info (#13775) This PR replaces the `def_lvl_bytes` and `rep_lvl_bytes` fields of the `gpu::PageInfo` struct with an array indexed by `gpu::level_type` (as is done with the `lvl_decode_buf` array). This allows for some streamlining in `InitLevelSection()`, removing some redundant code and improving readability. See this [comment](https://github.com/rapidsai/cudf/pull/13707#discussion_r1274118429) for context. 
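For illustration, a minimal standalone sketch (not the actual `PageInfo` definition, names are simplified) of what indexing the V2 level byte counts by `gpu::level_type` buys: both level kinds go through the same code path, so `InitLevelSection()` no longer needs a per-level ternary.

```cpp
// Hypothetical, simplified sketch of the refactor; types differ from cudf's real ones.
#include <cstdint>

enum level_type : int { DEFINITION = 0, REPETITION, NUM_LEVEL_TYPES };

struct page_info_sketch {
  // was: int32_t def_lvl_bytes; int32_t rep_lvl_bytes;
  std::int32_t lvl_bytes[NUM_LEVEL_TYPES];
};

// Code that previously branched on the level kind can now just index the array.
inline std::int32_t level_bytes(page_info_sketch const& page, level_type lvl)
{
  return page.lvl_bytes[lvl];
}
```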
Authors: - Ed Seidl (https://github.com/etseidl) - Nghia Truong (https://github.com/ttnghia) Approvers: - Vukasin Milovanovic (https://github.com/vuule) - Nghia Truong (https://github.com/ttnghia) URL: https://github.com/rapidsai/cudf/pull/13775 --- cpp/src/io/parquet/page_decode.cuh | 11 ++--------- cpp/src/io/parquet/page_hdr.cu | 10 +++++----- cpp/src/io/parquet/parquet_gpu.hpp | 3 +-- cpp/src/io/parquet/reader_impl_preprocess.cu | 3 ++- 4 files changed, 10 insertions(+), 17 deletions(-) diff --git a/cpp/src/io/parquet/page_decode.cuh b/cpp/src/io/parquet/page_decode.cuh index 4c4607150ce..b2c09980b6e 100644 --- a/cpp/src/io/parquet/page_decode.cuh +++ b/cpp/src/io/parquet/page_decode.cuh @@ -902,17 +902,10 @@ inline __device__ uint32_t InitLevelSection(page_state_s* s, // this is a little redundant. if level_bits == 0, then nothing should be encoded // for the level, but some V2 files in the wild violate this and encode the data anyway. // thus we will handle V2 headers separately. - if ((s->page.flags & PAGEINFO_FLAGS_V2) != 0) { + if ((s->page.flags & PAGEINFO_FLAGS_V2) != 0 && (len = s->page.lvl_bytes[lvl]) != 0) { // V2 only uses RLE encoding so no need to check encoding - len = lvl == level_type::DEFINITION ? s->page.def_lvl_bytes : s->page.rep_lvl_bytes; s->abs_lvl_start[lvl] = cur; - if (len == 0) { - s->initial_rle_run[lvl] = s->page.num_input_values * 2; // repeated value - s->initial_rle_value[lvl] = 0; - s->lvl_start[lvl] = cur; - } else { - init_rle(cur, cur + len); - } + init_rle(cur, cur + len); } else if (level_bits == 0) { len = 0; s->initial_rle_run[lvl] = s->page.num_input_values * 2; // repeated value diff --git a/cpp/src/io/parquet/page_hdr.cu b/cpp/src/io/parquet/page_hdr.cu index e5dd029fde2..1fc1b8faddc 100644 --- a/cpp/src/io/parquet/page_hdr.cu +++ b/cpp/src/io/parquet/page_hdr.cu @@ -310,8 +310,8 @@ struct gpuParseDataPageHeaderV2 { ParquetFieldInt32(2, bs->page.num_nulls), ParquetFieldInt32(3, bs->page.num_rows), ParquetFieldEnum(4, bs->page.encoding), - ParquetFieldInt32(5, bs->page.def_lvl_bytes), - ParquetFieldInt32(6, bs->page.rep_lvl_bytes)); + ParquetFieldInt32(5, bs->page.lvl_bytes[level_type::DEFINITION]), + ParquetFieldInt32(6, bs->page.lvl_bytes[level_type::REPETITION])); return parse_header(op, bs); } }; @@ -388,9 +388,9 @@ __global__ void __launch_bounds__(128) bs->page.num_rows = 0; bs->page.flags = 0; // zero out V2 info - bs->page.num_nulls = 0; - bs->page.def_lvl_bytes = 0; - bs->page.rep_lvl_bytes = 0; + bs->page.num_nulls = 0; + bs->page.lvl_bytes[level_type::DEFINITION] = 0; + bs->page.lvl_bytes[level_type::REPETITION] = 0; if (parse_page_header(bs) && bs->page.compressed_page_size >= 0) { switch (bs->page_type) { case PageType::DATA_PAGE: diff --git a/cpp/src/io/parquet/parquet_gpu.hpp b/cpp/src/io/parquet/parquet_gpu.hpp index 51d2f952a33..b3c9d0765fa 100644 --- a/cpp/src/io/parquet/parquet_gpu.hpp +++ b/cpp/src/io/parquet/parquet_gpu.hpp @@ -154,8 +154,7 @@ struct PageInfo { int32_t uncompressed_page_size; // uncompressed data size in bytes // for V2 pages, the def and rep level data is not compressed, and lacks the 4-byte length // indicator. instead the lengths for these are stored in the header. - int32_t def_lvl_bytes; // length of the definition levels (V2 header) - int32_t rep_lvl_bytes; // length of the repetition levels (V2 header) + int32_t lvl_bytes[level_type::NUM_LEVEL_TYPES]; // length of the rep/def levels (V2 header) // Number of values in this data page or dictionary. 
// Important : the # of input values does not necessarily // correspond to the number of rows in the output. It just reflects the number diff --git a/cpp/src/io/parquet/reader_impl_preprocess.cu b/cpp/src/io/parquet/reader_impl_preprocess.cu index c7e3de03312..1ea89f5f694 100644 --- a/cpp/src/io/parquet/reader_impl_preprocess.cu +++ b/cpp/src/io/parquet/reader_impl_preprocess.cu @@ -480,7 +480,8 @@ int decode_page_headers(cudf::detail::hostdevice_vector& c auto const dst_base = static_cast(decomp_pages.data()) + decomp_offset; auto& page = pages[page_idx]; // offset will only be non-zero for V2 pages - auto const offset = page.def_lvl_bytes + page.rep_lvl_bytes; + auto const offset = + page.lvl_bytes[gpu::level_type::DEFINITION] + page.lvl_bytes[gpu::level_type::REPETITION]; // for V2 need to copy def and rep level info into place, and then offset the // input and output buffers. otherwise we'd have to keep both the compressed // and decompressed data. From 709b15f8e9ff7be4205cf564e0722ab65cb6b800 Mon Sep 17 00:00:00 2001 From: David Wendt <45795991+davidwendt@users.noreply.github.com> Date: Wed, 16 Aug 2023 09:09:47 -0400 Subject: [PATCH 059/230] Fix nvtext::generate_character_ngrams performance regression for longer strings (#13874) Fixes performance regression when generating character ngrams. The regression was introduced as part of refactoring common code when adding the `nvtext::hash_character_ngrams` function (Reference #13654). Defactoring the code fixed the regression. Overall, these functions only share about 6 lines of code in common so the defactoring is expected to require minimal maintenance. The defactoring involves re-instating the original kernel code logic for `nvtext::generate_character_ngrams`. Authors: - David Wendt (https://github.com/davidwendt) Approvers: - Nghia Truong (https://github.com/ttnghia) - Bradley Dice (https://github.com/bdice) URL: https://github.com/rapidsai/cudf/pull/13874 --- cpp/src/text/generate_ngrams.cu | 115 ++++++++++++-------------------- 1 file changed, 42 insertions(+), 73 deletions(-) diff --git a/cpp/src/text/generate_ngrams.cu b/cpp/src/text/generate_ngrams.cu index e0a5b860005..938fd45246d 100644 --- a/cpp/src/text/generate_ngrams.cu +++ b/cpp/src/text/generate_ngrams.cu @@ -160,82 +160,41 @@ namespace detail { namespace { /** - * @brief Base class for generating character ngrams - * - * The ngrams are produced for each string and the derived class's - * `process_ngram` function is called for each ngram/substring. + * @brief Generate character ngrams for each string * - * @tparam Derived class uses the CRTP pattern to reuse code logic. + * Each string produces many strings depending on the ngram width and the string size. + * This functor can be used with `make_strings_children` to build the offsets and + * the chars child columns. 
*/ -template -struct base_character_ngram_fn { +struct character_ngram_generator_fn { cudf::column_device_view const d_strings; cudf::size_type ngrams; cudf::size_type const* d_ngram_offsets{}; + cudf::size_type* d_offsets{}; + char* d_chars{}; - base_character_ngram_fn(cudf::column_device_view const& d_strings, - cudf::size_type ngrams, - cudf::size_type const* d_ngram_offsets) - : d_strings(d_strings), ngrams(ngrams), d_ngram_offsets(d_ngram_offsets) - { - } - - __device__ void operator()(cudf::size_type idx) const + __device__ void operator()(cudf::size_type idx) { if (d_strings.is_null(idx)) return; auto const d_str = d_strings.element(idx); if (d_str.empty()) return; - auto const& derived = static_cast(*this); auto itr = d_str.begin(); auto const ngram_offset = d_ngram_offsets[idx]; auto const ngram_count = d_ngram_offsets[idx + 1] - ngram_offset; + auto d_sizes = d_offsets + ngram_offset; + auto out_ptr = d_chars ? d_chars + *d_sizes : nullptr; for (cudf::size_type n = 0; n < ngram_count; ++n, ++itr) { auto const begin = itr.byte_offset(); auto const end = (itr + ngrams).byte_offset(); - auto const ngram = cudf::string_view(d_str.data() + begin, end - begin); - derived.process_ngram(ngram, n + ngram_offset); - } - } -}; - -/** - * @brief Generate character ngrams for each string - * - * Each string produces many strings depending on the ngram width and the string size. - * This functor can be used with `make_strings_children` to build the offsets and - * the chars child columns. - */ -struct character_ngram_generator_fn : base_character_ngram_fn { - cudf::size_type* d_offsets{}; - char* d_chars{}; - - character_ngram_generator_fn(cudf::column_device_view const& d_strings, - cudf::size_type ngrams, - cudf::size_type const* d_ngram_offsets) - : base_character_ngram_fn(d_strings, ngrams, d_ngram_offsets) - { - } - - /** - * @brief Called through the base class for each ngram - * - * Either stores the size of each string or copies the string to the output - * - * @param d_str The ngram substring to process - * @param offset The output position relative to d_offsets - */ - __device__ void process_ngram(cudf::string_view d_str, cudf::size_type offset) const - { - auto d_str_offsets = d_offsets + offset; - if (d_chars) { - auto out_ptr = d_chars + *d_str_offsets; - cudf::strings::detail::copy_string(out_ptr, d_str); - } else { - *d_str_offsets = d_str.size_bytes(); + if (d_chars) { + out_ptr = + cudf::strings::detail::copy_and_increment(out_ptr, d_str.data() + begin, (end - begin)); + } else { + *d_sizes++ = end - begin; + } } } }; - } // namespace std::unique_ptr generate_character_ngrams(cudf::strings_column_view const& strings, @@ -253,7 +212,7 @@ std::unique_ptr generate_character_ngrams(cudf::strings_column_vie auto const d_strings = *strings_column; // create a vector of ngram offsets for each string - rmm::device_uvector ngram_offsets(strings_count + 1, stream); + rmm::device_uvector ngram_offsets(strings_count + 1, stream); thrust::transform_exclusive_scan( rmm::exec_policy(stream), thrust::make_counting_iterator(0), @@ -262,7 +221,7 @@ std::unique_ptr generate_character_ngrams(cudf::strings_column_vie [d_strings, strings_count, ngrams] __device__(auto idx) { if (d_strings.is_null(idx) || (idx == strings_count)) return 0; auto const length = d_strings.element(idx).length(); - return std::max(0, static_cast(length + 1 - ngrams)); + return std::max(0, static_cast(length + 1 - ngrams)); }, cudf::size_type{0}, thrust::plus()); @@ -282,23 +241,33 @@ std::unique_ptr 
generate_character_ngrams(cudf::strings_column_vie namespace { /** - * @brief Computes the hash of each ngram as produced by the base class + * @brief Computes the hash of each character ngram + * + * Each thread processes a single string. Substrings are resolved for every character + * of the string and hashed. */ -struct character_ngram_hash_fn : base_character_ngram_fn { - cudf::hash_value_type* d_hashes; - - character_ngram_hash_fn(cudf::column_device_view const& d_strings, - cudf::size_type ngrams, - cudf::size_type const* d_ngram_offsets, - cudf::hash_value_type* d_hashes) - : base_character_ngram_fn(d_strings, ngrams, d_ngram_offsets), d_hashes(d_hashes) - { - } +struct character_ngram_hash_fn { + cudf::column_device_view const d_strings; + cudf::size_type ngrams; + cudf::size_type const* d_ngram_offsets; + cudf::hash_value_type* d_results; - __device__ void process_ngram(cudf::string_view d_str, cudf::size_type offset) const + __device__ void operator()(cudf::size_type idx) const { - auto const hasher = cudf::hashing::detail::MurmurHash3_x86_32{0}; - d_hashes[offset] = hasher(d_str); + if (d_strings.is_null(idx)) return; + auto const d_str = d_strings.element(idx); + if (d_str.empty()) return; + auto itr = d_str.begin(); + auto const ngram_offset = d_ngram_offsets[idx]; + auto const ngram_count = d_ngram_offsets[idx + 1] - ngram_offset; + auto const hasher = cudf::hashing::detail::MurmurHash3_x86_32{0}; + auto d_hashes = d_results + ngram_offset; + for (cudf::size_type n = 0; n < ngram_count; ++n, ++itr) { + auto const begin = itr.byte_offset(); + auto const end = (itr + ngrams).byte_offset(); + auto const ngram = cudf::string_view(d_str.data() + begin, end - begin); + *d_hashes++ = hasher(ngram); + } } }; } // namespace From 5d5032dd920a0d2dcd3c0406a9d3745670da2432 Mon Sep 17 00:00:00 2001 From: Peter Andreas Entschev Date: Wed, 16 Aug 2023 18:48:43 +0200 Subject: [PATCH 060/230] Fix read out of bounds in string concatenate (#13838) If data is sufficiently large, `fused_concatenate_string_chars_kernel` will attempt to read out of bounds and ultimately cause CUDA to raise `cudaErrorIllegalAddress`. Details on how the issue was encountered are in https://github.com/rapidsai/cudf/issues/13771, although this was an [already known problem](https://github.com/rapidsai/cudf/issues/10333#issuecomment-1048229610). Fixes #13771 . 
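For context, a hedged sketch (not the cudf kernel itself) of the failure mode and the fix: with a 32-bit `size_type` index, the flattened output position can wrap past `INT32_MAX` for sufficiently large inputs, producing out-of-bounds reads; computing the index in a 64-bit type (as `cudf::thread_index_type` does) avoids the wrap.

```cpp
// Hypothetical grid-stride kernel illustrating the 64-bit index computation.
#include <cstdint>

__global__ void copy_chars(char const* in, char* out, std::int64_t n)
{
  // 64-bit arithmetic: blockIdx.x * blockDim.x cannot overflow the index type.
  std::int64_t idx = threadIdx.x + blockIdx.x * static_cast<std::int64_t>(blockDim.x);
  while (idx < n) {
    out[idx] = in[idx];
    idx += static_cast<std::int64_t>(blockDim.x) * gridDim.x;
  }
}
```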
Authors: - Peter Andreas Entschev (https://github.com/pentschev) Approvers: - Bradley Dice (https://github.com/bdice) - Yunsong Wang (https://github.com/PointKernel) - Nghia Truong (https://github.com/ttnghia) URL: https://github.com/rapidsai/cudf/pull/13838 --- cpp/src/strings/copying/concatenate.cu | 6 ++--- cpp/tests/copying/concatenate_tests.cpp | 31 +++++++++++++++++++++++++ 2 files changed, 34 insertions(+), 3 deletions(-) diff --git a/cpp/src/strings/copying/concatenate.cu b/cpp/src/strings/copying/concatenate.cu index 287910c9a6f..26cd4fff09b 100644 --- a/cpp/src/strings/copying/concatenate.cu +++ b/cpp/src/strings/copying/concatenate.cu @@ -121,8 +121,8 @@ __global__ void fused_concatenate_string_offset_kernel(column_device_view const* bitmask_type* output_mask, size_type* out_valid_count) { - size_type output_index = threadIdx.x + blockIdx.x * blockDim.x; - size_type warp_valid_count = 0; + cudf::thread_index_type output_index = threadIdx.x + blockIdx.x * blockDim.x; + size_type warp_valid_count = 0; unsigned active_mask; if (Nullable) { active_mask = __ballot_sync(0xFFFF'FFFFu, output_index < output_size); } @@ -175,7 +175,7 @@ __global__ void fused_concatenate_string_chars_kernel(column_device_view const* size_type const output_size, char* output_data) { - size_type output_index = threadIdx.x + blockIdx.x * blockDim.x; + cudf::thread_index_type output_index = threadIdx.x + blockIdx.x * blockDim.x; while (output_index < output_size) { // Lookup input index by searching for output index in offsets diff --git a/cpp/tests/copying/concatenate_tests.cpp b/cpp/tests/copying/concatenate_tests.cpp index 0c6394637fc..c81f1772d10 100644 --- a/cpp/tests/copying/concatenate_tests.cpp +++ b/cpp/tests/copying/concatenate_tests.cpp @@ -162,6 +162,37 @@ TEST_F(StringColumnTest, ConcatenateColumnView) CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); } +TEST_F(StringColumnTest, ConcatenateColumnViewLarge) +{ + // Test large concatenate, causes out of bound device memory errors if kernel + // indexing is not int64_t. + // 1.5GB bytes, 5k columns + constexpr size_t num_strings = 10000; + constexpr size_t string_length = 150000; + constexpr size_t strings_per_column = 2; + constexpr size_t num_columns = num_strings / strings_per_column; + + std::vector strings; + std::vector h_strings; + std::vector strings_column_wrappers; + std::vector strings_columns; + + std::string s(string_length, 'a'); + for (size_t i = 0; i < num_strings; ++i) + h_strings.push_back(s.data()); + + for (size_t i = 0; i < num_columns; ++i) + strings_column_wrappers.push_back(cudf::test::strings_column_wrapper( + h_strings.data() + i * strings_per_column, h_strings.data() + (i + 1) * strings_per_column)); + for (auto& wrapper : strings_column_wrappers) + strings_columns.push_back(wrapper); + + auto results = cudf::concatenate(strings_columns); + + cudf::test::strings_column_wrapper expected(h_strings.begin(), h_strings.end()); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); +} + TEST_F(StringColumnTest, ConcatenateTooManyColumns) { std::vector h_strings{"aaa", From 42808a4d45056ced21e855b1020a4b15b3e6a1a3 Mon Sep 17 00:00:00 2001 From: GALI PREM SAGAR Date: Wed, 16 Aug 2023 13:06:41 -0500 Subject: [PATCH 061/230] Raise error when trying to construct a `DataFrame` with mixed types (#13889) Continuation to https://github.com/rapidsai/cudf/pull/13768/, In #13768 we prevented construction of mixed types in `Index` & `Series`. This PR implements the same for `DataFrame`. 
Authors: - GALI PREM SAGAR (https://github.com/galipremsagar) Approvers: - Bradley Dice (https://github.com/bdice) - https://github.com/brandon-b-miller URL: https://github.com/rapidsai/cudf/pull/13889 --- python/cudf/cudf/core/column/column.py | 15 ++++++++++++++- python/cudf/cudf/tests/test_dataframe.py | 9 +++++++++ 2 files changed, 23 insertions(+), 1 deletion(-) diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index bd0e051c4b7..c1ad5de1181 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -2213,7 +2213,20 @@ def as_column( pa.Array.from_pandas(interval_series), dtype=arb_dtype ) elif arb_dtype.kind in ("O", "U"): - data = as_column(pa.Array.from_pandas(arbitrary), dtype=arb_dtype) + pyarrow_array = pa.Array.from_pandas(arbitrary) + if not isinstance( + pyarrow_array, + ( + pa.ListArray, + pa.StructArray, + pa.NullArray, + pa.Decimal128Array, + pa.StringArray, + pa.BooleanArray, + ), + ): + raise MixedTypeError("Cannot create column with mixed types") + data = as_column(pyarrow_array, dtype=arb_dtype) else: data = as_column( pa.array( diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py index 97e399a9cd5..2e1e20dee40 100644 --- a/python/cudf/cudf/tests/test_dataframe.py +++ b/python/cudf/cudf/tests/test_dataframe.py @@ -10279,3 +10279,12 @@ def test_dataframe_constructor_from_namedtuple(): cudf.DataFrame(data, index=idx) with pytest.raises(ValueError): pd.DataFrame(data, index=idx) + + +@pytest.mark.parametrize( + "dtype", ["datetime64[ns]", "timedelta64[ns]", "int64", "float32"] +) +def test_dataframe_mixed_dtype_error(dtype): + pdf = pd.Series([1, 2, 3], dtype=dtype).to_frame().astype(object) + with pytest.raises(TypeError): + cudf.from_pandas(pdf) From b6dbe41ddbd24f202424055403199be16ad6665c Mon Sep 17 00:00:00 2001 From: David Wendt <45795991+davidwendt@users.noreply.github.com> Date: Wed, 16 Aug 2023 14:19:42 -0400 Subject: [PATCH 062/230] Add cudf::strings::find function with target per row (#13808) Adds new strings find function ``` std::unique_ptr find( strings_column_view const& input, strings_column_view const& target, size_type start, rmm::mr::device_memory_resource* mr); ``` The output of row `i` is the character position of the target string for row `i` within input string of row `i` starting at the character position `start`. If the target is not found within the input string, -1 is returned for that row entry in the output column. Closes #12013 Authors: - David Wendt (https://github.com/davidwendt) Approvers: - Bradley Dice (https://github.com/bdice) - Nghia Truong (https://github.com/ttnghia) URL: https://github.com/rapidsai/cudf/pull/13808 --- cpp/include/cudf/strings/find.hpp | 29 +++++++- cpp/src/strings/search/find.cu | 112 +++++++++++++++++++++++------- cpp/tests/strings/find_tests.cpp | 68 ++++++++++++++++-- 3 files changed, 178 insertions(+), 31 deletions(-) diff --git a/cpp/include/cudf/strings/find.hpp b/cpp/include/cudf/strings/find.hpp index 4f4b71ac82d..2fed36862b9 100644 --- a/cpp/include/cudf/strings/find.hpp +++ b/cpp/include/cudf/strings/find.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2022, NVIDIA CORPORATION. + * Copyright (c) 2019-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -87,6 +87,33 @@ std::unique_ptr rfind( size_type stop = -1, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +/** + * @brief Returns a column of character position values where the target + * string is first found in the corresponding string of the provided column + * + * The output of row `i` is the character position of the target string for row `i` + * within input string of row `i` starting at the character position `start`. + * If the target is not found within the input string, -1 is returned for that + * row entry in the output column. + * + * Any null input or target entries return corresponding null output column entries. + * + * @throw cudf::logic_error if `input.size() != target.size()` + * + * @param input Strings to search against + * @param target Strings to search for in `input` + * @param start First character position to include in the search + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate the returned column's device memory + * @return New integer column with character position values + */ +std::unique_ptr find( + strings_column_view const& input, + strings_column_view const& target, + size_type start = 0, + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + /** * @brief Returns a column of boolean values for each string where true indicates * the target string was found within that string in the provided column. diff --git a/cpp/src/strings/search/find.cu b/cpp/src/strings/search/find.cu index e5ce88e7583..3de9dd34d83 100644 --- a/cpp/src/strings/search/find.cu +++ b/cpp/src/strings/search/find.cu @@ -16,6 +16,7 @@ #include #include +#include #include #include #include @@ -33,6 +34,7 @@ #include #include #include +#include #include #include @@ -56,17 +58,19 @@ constexpr size_type AVG_CHAR_BYTES_THRESHOLD = 64; /** * @brief Find function handles a string per thread */ -template +template struct finder_fn { column_device_view const d_strings; - string_view const d_target; + TargetIterator const d_targets; size_type const start; size_type const stop; __device__ size_type operator()(size_type idx) const { if (d_strings.is_null(idx)) { return -1; } - auto d_str = d_strings.element(idx); + auto const d_str = d_strings.element(idx); + if (d_str.empty() && (start > 0)) { return -1; } + auto const d_target = d_targets[idx]; auto const length = d_str.length(); auto const begin = (start > length) ? length : start; @@ -110,9 +114,9 @@ struct empty_target_fn { /** * @brief String per warp function for find/rfind */ -template +template __global__ void finder_warp_parallel_fn(column_device_view const d_strings, - string_view const d_target, + TargetIterator const d_targets, size_type const start, size_type const stop, size_type* d_results) @@ -130,7 +134,8 @@ __global__ void finder_warp_parallel_fn(column_device_view const d_strings, if (lane_idx == 0) { d_results[str_idx] = forward ? 
std::numeric_limits::max() : -1; } __syncwarp(); - auto const d_str = d_strings.element(str_idx); + auto const d_str = d_strings.element(str_idx); + auto const d_target = d_targets[str_idx]; auto const [begin, left_over] = bytes_to_character_position(d_str, start); auto const start_char_pos = start - left_over; // keep track of character position @@ -171,6 +176,33 @@ __global__ void finder_warp_parallel_fn(column_device_view const d_strings, } } +template +void find_utility(strings_column_view const& input, + TargetIterator const& target_itr, + column& output, + size_type start, + size_type stop, + rmm::cuda_stream_view stream) +{ + auto d_strings = column_device_view::create(input.parent(), stream); + auto d_results = output.mutable_view().data(); + if ((input.chars_size() / (input.size() - input.null_count())) > AVG_CHAR_BYTES_THRESHOLD) { + // warp-per-string runs faster for longer strings (but not shorter ones) + constexpr int block_size = 256; + cudf::detail::grid_1d grid{input.size() * cudf::detail::warp_size, block_size}; + finder_warp_parallel_fn + <<>>( + *d_strings, target_itr, start, stop, d_results); + } else { + // string-per-thread function + thrust::transform(rmm::exec_policy(stream), + thrust::make_counting_iterator(0), + thrust::make_counting_iterator(input.size()), + d_results, + finder_fn{*d_strings, target_itr, start, stop}); + } +} + template std::unique_ptr find_fn(strings_column_view const& input, string_scalar const& target, @@ -183,9 +215,6 @@ std::unique_ptr find_fn(strings_column_view const& input, CUDF_EXPECTS(start >= 0, "Parameter start must be positive integer or zero."); if ((stop > 0) && (start > stop)) CUDF_FAIL("Parameter start must be less than stop."); - auto d_target = string_view(target.data(), target.size()); - auto d_strings = column_device_view::create(input.parent(), stream); - // create output column auto results = make_numeric_column(data_type{type_to_id()}, input.size(), @@ -196,32 +225,24 @@ std::unique_ptr find_fn(strings_column_view const& input, // if input is empty or all-null then we are done if (input.size() == input.null_count()) { return results; } - auto d_results = results->mutable_view().data(); + auto d_target = string_view(target.data(), target.size()); + // special logic for empty target results if (d_target.empty()) { - // special logic for empty target results + auto d_strings = column_device_view::create(input.parent(), stream); + auto d_results = results->mutable_view().data(); thrust::transform(rmm::exec_policy(stream), thrust::counting_iterator(0), thrust::counting_iterator(input.size()), d_results, empty_target_fn{*d_strings, start, stop}); - } else if ((input.chars_size() / (input.size() - input.null_count())) > - AVG_CHAR_BYTES_THRESHOLD) { - // warp-per-string runs faster for longer strings (but not shorter ones) - constexpr int block_size = 256; - cudf::detail::grid_1d grid{input.size() * cudf::detail::warp_size, block_size}; - finder_warp_parallel_fn - <<>>( - *d_strings, d_target, start, stop, d_results); - } else { - // string-per-thread function - thrust::transform(rmm::exec_policy(stream), - thrust::make_counting_iterator(0), - thrust::make_counting_iterator(input.size()), - d_results, - finder_fn{*d_strings, d_target, start, stop}); + return results; } + // find-utility function fills in the results column + auto target_itr = thrust::make_constant_iterator(d_target); + using TargetIterator = decltype(target_itr); + find_utility(input, target_itr, *results, start, stop, stream); 
results->set_null_count(input.null_count()); return results; } @@ -247,6 +268,35 @@ std::unique_ptr rfind(strings_column_view const& input, return find_fn(input, target, start, stop, stream, mr); } +template +std::unique_ptr find(strings_column_view const& input, + strings_column_view const& target, + size_type start, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + CUDF_EXPECTS(start >= 0, "Parameter start must be positive integer or zero."); + CUDF_EXPECTS(input.size() == target.size(), "input and target columns must be the same size"); + + // create output column + auto results = make_numeric_column( + data_type{type_to_id()}, input.size(), rmm::device_buffer{}, 0, stream, mr); + // if input is empty or all-null then we are done + if (input.size() == input.null_count()) { return results; } + + // call find utility with target iterator + auto d_targets = column_device_view::create(target.parent(), stream); + auto target_itr = cudf::detail::make_null_replacement_iterator( + *d_targets, string_view{}, target.has_nulls()); + find_utility(input, target_itr, *results, start, -1, stream); + + // AND the bitmasks from input and target + auto [null_mask, null_count] = + cudf::detail::bitmask_and(table_view({input.parent(), target.parent()}), stream, mr); + results->set_null_mask(std::move(null_mask), null_count); + return results; +} + } // namespace detail // external APIs @@ -271,6 +321,16 @@ std::unique_ptr rfind(strings_column_view const& strings, return detail::rfind(strings, target, start, stop, cudf::get_default_stream(), mr); } +std::unique_ptr find(strings_column_view const& input, + strings_column_view const& target, + size_type start, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + CUDF_FUNC_RANGE(); + return detail::find(input, target, start, stream, mr); +} + namespace detail { namespace { diff --git a/cpp/tests/strings/find_tests.cpp b/cpp/tests/strings/find_tests.cpp index b1efeb0d4cb..e64a368a952 100644 --- a/cpp/tests/strings/find_tests.cpp +++ b/cpp/tests/strings/find_tests.cpp @@ -33,16 +33,17 @@ struct StringsFindTest : public cudf::test::BaseFixture {}; TEST_F(StringsFindTest, Find) { - cudf::test::strings_column_wrapper strings({"Héllo", "thesé", "", "lease", "tést strings", ""}, + cudf::test::strings_column_wrapper strings({"Héllo", "thesé", "", "lest", "tést strings", ""}, {1, 1, 0, 1, 1, 1}); auto strings_view = cudf::strings_column_view(strings); { + auto const target = cudf::string_scalar("é"); cudf::test::fixed_width_column_wrapper expected({1, 4, -1, -1, 1, -1}, {1, 1, 0, 1, 1, 1}); - auto results = cudf::strings::find(strings_view, cudf::string_scalar("é")); + auto results = cudf::strings::find(strings_view, target); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); - results = cudf::strings::rfind(strings_view, cudf::string_scalar("é")); + results = cudf::strings::rfind(strings_view, target); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); } { @@ -51,6 +52,15 @@ TEST_F(StringsFindTest, Find) auto results = cudf::strings::rfind(strings_view, cudf::string_scalar("l")); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); } + { + auto const target = cudf::string_scalar("es"); + cudf::test::fixed_width_column_wrapper expected({-1, 2, -1, 1, -1, -1}, + {1, 1, 0, 1, 1, 1}); + auto results = cudf::strings::find(strings_view, target); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); + results = cudf::strings::rfind(strings_view, target); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); + } { 
cudf::test::fixed_width_column_wrapper expected({0, 0, 0, 0, 0, 0}, {1, 1, 0, 1, 1, 1}); @@ -58,11 +68,38 @@ TEST_F(StringsFindTest, Find) CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); } { - cudf::test::fixed_width_column_wrapper expected({5, 5, 0, 5, 12, 0}, + cudf::test::fixed_width_column_wrapper expected({5, 5, 0, 4, 12, 0}, {1, 1, 0, 1, 1, 1}); auto results = cudf::strings::rfind(strings_view, cudf::string_scalar("")); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); } + { + auto const targets = cudf::test::strings_column_wrapper({"l", "t", "", "x", "é", "o"}); + cudf::test::fixed_width_column_wrapper expected({2, 0, 0, -1, 1, -1}, + {1, 1, 0, 1, 1, 1}); + auto results = cudf::strings::find(strings_view, cudf::strings_column_view(targets)); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); + } + { + cudf::test::fixed_width_column_wrapper expected({0, 0, 0, 0, 0, 0}, + {1, 1, 0, 1, 1, 1}); + auto results = cudf::strings::find(strings_view, strings_view); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); + } +} + +TEST_F(StringsFindTest, FindWithNullTargets) +{ + cudf::test::strings_column_wrapper input({"hello hello", "thesé help", "", "helicopter", "", "x"}, + {1, 1, 0, 1, 1, 1}); + auto strings_view = cudf::strings_column_view(input); + + auto const targets = cudf::test::strings_column_wrapper( + {"lo he", "", "hhh", "cop", "help", "xyz"}, {1, 0, 1, 1, 1, 1}); + cudf::test::fixed_width_column_wrapper expected({3, -1, -1, 4, -1, -1}, + {1, 0, 0, 1, 1, 1}); + auto results = cudf::strings::find(strings_view, cudf::strings_column_view(targets)); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); } TEST_F(StringsFindTest, FindLongStrings) @@ -81,9 +118,19 @@ TEST_F(StringsFindTest, FindLongStrings) cudf::test::fixed_width_column_wrapper({7, 28, 0, 11, -1, -1, -1}); CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected); + auto targets = + cudf::test::strings_column_wrapper({"the", "the", "the", "the", "the", "the", "the"}); + results = cudf::strings::find(view, cudf::strings_column_view(targets)); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected); + results = cudf::strings::rfind(view, cudf::string_scalar("the")); expected = cudf::test::fixed_width_column_wrapper({7, 48, 0, 77, -1, -1, -1}); CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected); + + targets = cudf::test::strings_column_wrapper({"there", "cat", "the", "", "the", "are", "dog"}); + results = cudf::strings::find(view, cudf::strings_column_view(targets)); + expected = cudf::test::fixed_width_column_wrapper({7, 56, 0, 0, -1, 73, -1}); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected); } TEST_F(StringsFindTest, Contains) @@ -331,6 +378,8 @@ TEST_F(StringsFindTest, ErrorCheck) EXPECT_THROW(cudf::strings::find(strings_view, cudf::string_scalar(""), 2, 1), cudf::logic_error); EXPECT_THROW(cudf::strings::rfind(strings_view, cudf::string_scalar(""), 2, 1), cudf::logic_error); + EXPECT_THROW(cudf::strings::find(strings_view, targets_view), cudf::logic_error); + EXPECT_THROW(cudf::strings::find(strings_view, strings_view, -1), cudf::logic_error); } class FindParmsTest : public StringsFindTest, @@ -372,6 +421,17 @@ TEST_P(FindParmsTest, Find) cudf::test::fixed_width_column_wrapper rexpected({end, 0, end, end, end}); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, rexpected); } + { + std::vector h_targets({"l", "", "", "l", "s"}); + std::vector h_expected; + for (std::size_t i = 0; i < h_strings.size(); ++i) + h_expected.push_back(static_cast(h_strings[i].find(h_targets[i], position))); + 
cudf::test::fixed_width_column_wrapper expected(h_expected.begin(), + h_expected.end()); + cudf::test::strings_column_wrapper targets(h_targets.begin(), h_targets.end()); + auto results = cudf::strings::find(strings_view, cudf::strings_column_view(targets), position); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); + } } INSTANTIATE_TEST_CASE_P(StringsFindTest, From fdeababc65d41b52821a47ac025b627d59920f1d Mon Sep 17 00:00:00 2001 From: GALI PREM SAGAR Date: Wed, 16 Aug 2023 13:40:36 -0500 Subject: [PATCH 063/230] Disable creation of `DatetimeIndex` when `freq` is passed to `cudf.date_range` (#13890) In https://github.com/rapidsai/cudf/pull/13857/, we disabled the construction of `DatetimeIndex` & `TimedeltaIndex` when their pandas counter-parts have `freq` set. However, when `freq` is passed to `date_range`, the expectation and pandas-behavior is to return a `DateTimeIndex` with `freq` set. This isn't possible with cudf currently, hence disabled the construction of `DatetimeIndex` in this case. Authors: - GALI PREM SAGAR (https://github.com/galipremsagar) Approvers: - Bradley Dice (https://github.com/bdice) URL: https://github.com/rapidsai/cudf/pull/13890 --- python/cudf/cudf/core/tools/datetimes.py | 4 ++++ python/cudf/cudf/tests/test_datetime.py | 13 +++++++++++++ 2 files changed, 17 insertions(+) diff --git a/python/cudf/cudf/core/tools/datetimes.py b/python/cudf/cudf/core/tools/datetimes.py index f7dea65a401..7c4b9810df2 100644 --- a/python/cudf/cudf/core/tools/datetimes.py +++ b/python/cudf/cudf/core/tools/datetimes.py @@ -833,6 +833,10 @@ def date_range( arr = cp.linspace(start=start, stop=end, num=periods) result = cudf.core.column.as_column(arr).astype("datetime64[ns]") return cudf.DatetimeIndex._from_data({name: result}) + elif cudf.get_option("mode.pandas_compatible"): + raise NotImplementedError( + "`DatetimeIndex` with `freq` cannot be constructed." + ) # The code logic below assumes `freq` is defined. It is first normalized # into `DateOffset` for further computation with timestamps. diff --git a/python/cudf/cudf/tests/test_datetime.py b/python/cudf/cudf/tests/test_datetime.py index 4c4657ccba1..abcc057f823 100644 --- a/python/cudf/cudf/tests/test_datetime.py +++ b/python/cudf/cudf/tests/test_datetime.py @@ -2133,3 +2133,16 @@ def test_datetime_series_cmpops_pandas_compatibility(data1, data2, op): def test_datetime_getitem_na(): s = cudf.Series([1, 2, None, 3], dtype="datetime64[ns]") assert s[2] is cudf.NaT + + +def test_daterange_pandas_compatibility(): + with cudf.option_context("mode.pandas_compatible", True): + with pytest.raises(NotImplementedError): + cudf.date_range("20010101", "20020215", freq="400h", name="times") + expected = pd.date_range( + "2010-01-01", "2010-02-01", periods=10, name="times" + ) + actual = cudf.date_range( + "2010-01-01", "2010-02-01", periods=10, name="times" + ) + assert_eq(expected, actual) From 3f995695672945060db6989370329646067cb126 Mon Sep 17 00:00:00 2001 From: Vukasin Milovanovic Date: Wed, 16 Aug 2023 14:46:54 -0700 Subject: [PATCH 064/230] Register the memory mapped buffer in `datasource` to improve H2D throughput (#13814) On systems where pageable memory uses host page tables, `cudaHostRegister` is very cheap. Since host buffer registration can improve throughput, datasource now registers the entire memory mapped buffer when host page tables are used. This mainly impacts the CSV reader, which reads input files using a `host_read` call. 
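As a rough, standalone illustration (not the `datasource` code itself) of the mechanism: registering an existing host buffer with `cudaHostRegister` lets later host-to-device copies from that buffer take the pinned-memory path, and on systems where `pageableMemoryAccessUsesHostPageTables` is 1 the registration itself is cheap, which is what makes registering the whole mapped file worthwhile.

```cpp
// Hypothetical sketch: page-lock a pre-existing host allocation, use it, unregister.
#include <cuda_runtime.h>
#include <cstdio>
#include <vector>

int main()
{
  std::vector<char> host_buf(1 << 20);  // stand-in for the memory-mapped file region

  // Best effort: if registration fails, reads still work, just without the speedup.
  if (cudaHostRegister(host_buf.data(), host_buf.size(), cudaHostRegisterDefault) !=
      cudaSuccess) {
    std::printf("cudaHostRegister failed; continuing with pageable memory\n");
  }

  // ... cudaMemcpy(..., host_buf.data(), ..., cudaMemcpyHostToDevice) now behaves
  // like a copy from pinned memory ...

  cudaHostUnregister(host_buf.data());
  return 0;
}
```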
Authors: - Vukasin Milovanovic (https://github.com/vuule) Approvers: - David Wendt (https://github.com/davidwendt) - Bradley Dice (https://github.com/bdice) URL: https://github.com/rapidsai/cudf/pull/13814 --- cpp/src/io/utilities/datasource.cpp | 72 +++++++++++++++++++++++++++-- 1 file changed, 67 insertions(+), 5 deletions(-) diff --git a/cpp/src/io/utilities/datasource.cpp b/cpp/src/io/utilities/datasource.cpp index 6186d9d9736..7a7121aa91d 100644 --- a/cpp/src/io/utilities/datasource.cpp +++ b/cpp/src/io/utilities/datasource.cpp @@ -32,6 +32,8 @@ #include #include +#include + namespace cudf { namespace io { namespace { @@ -107,6 +109,27 @@ class file_source : public datasource { static constexpr size_t _gds_read_preferred_threshold = 128 << 10; // 128KB }; +/** + * @brief Memoized pageableMemoryAccessUsesHostPageTables device property. + */ +[[nodiscard]] bool pageableMemoryAccessUsesHostPageTables() +{ + static std::unordered_map result_cache{}; + + int deviceId{}; + CUDF_CUDA_TRY(cudaGetDevice(&deviceId)); + + if (result_cache.find(deviceId) == result_cache.end()) { + cudaDeviceProp props{}; + CUDF_CUDA_TRY(cudaGetDeviceProperties(&props, deviceId)); + result_cache[deviceId] = (props.pageableMemoryAccessUsesHostPageTables == 1); + CUDF_LOG_INFO( + "Device {} pageableMemoryAccessUsesHostPageTables: {}", deviceId, result_cache[deviceId]); + } + + return result_cache[deviceId]; +} + /** * @brief Implementation class for reading from a file using memory mapped access. * @@ -118,12 +141,18 @@ class memory_mapped_source : public file_source { explicit memory_mapped_source(char const* filepath, size_t offset, size_t size) : file_source(filepath) { - if (_file.size() != 0) map(_file.desc(), offset, size); + if (_file.size() != 0) { + map(_file.desc(), offset, size); + register_mmap_buffer(); + } } ~memory_mapped_source() override { - if (_map_addr != nullptr) { munmap(_map_addr, _map_size); } + if (_map_addr != nullptr) { + munmap(_map_addr, _map_size); + unregister_mmap_buffer(); + } } std::unique_ptr host_read(size_t offset, size_t size) override @@ -150,6 +179,38 @@ class memory_mapped_source : public file_source { } private: + /** + * @brief Page-locks (registers) the memory range of the mapped file. + * + * Fixes nvbugs/4215160 + */ + void register_mmap_buffer() + { + if (_map_addr == nullptr or _map_size == 0 or not pageableMemoryAccessUsesHostPageTables()) { + return; + } + + auto const result = cudaHostRegister(_map_addr, _map_size, cudaHostRegisterDefault); + if (result == cudaSuccess) { + _is_map_registered = true; + } else { + CUDF_LOG_WARN("cudaHostRegister failed with {} ({})", result, cudaGetErrorString(result)); + } + } + + /** + * @brief Unregisters the memory range of the mapped file. 
+ */ + void unregister_mmap_buffer() + { + if (not _is_map_registered) { return; } + + auto const result = cudaHostUnregister(_map_addr); + if (result != cudaSuccess) { + CUDF_LOG_WARN("cudaHostUnregister failed with {} ({})", result, cudaGetErrorString(result)); + } + } + void map(int fd, size_t offset, size_t size) { CUDF_EXPECTS(offset < _file.size(), "Offset is past end of file"); @@ -168,9 +229,10 @@ class memory_mapped_source : public file_source { } private: - size_t _map_size = 0; - size_t _map_offset = 0; - void* _map_addr = nullptr; + size_t _map_size = 0; + size_t _map_offset = 0; + void* _map_addr = nullptr; + bool _is_map_registered = false; }; /** From 4fd6dd7b960e497abd13127e8eb7939f168bce08 Mon Sep 17 00:00:00 2001 From: Karthikeyan <6488848+karthikeyann@users.noreply.github.com> Date: Thu, 17 Aug 2023 03:22:54 +0530 Subject: [PATCH 065/230] Fix List's missing children metadata in JSON writer (#13869) Fixes #13800 The children metadata of list column was not constructed in Cython code. This is fixed by constructing all children columns metadata for list column. Authors: - Karthikeyan (https://github.com/karthikeyann) Approvers: - Bradley Dice (https://github.com/bdice) - GALI PREM SAGAR (https://github.com/galipremsagar) URL: https://github.com/rapidsai/cudf/pull/13869 --- python/cudf/cudf/_lib/json.pyx | 9 ++-- python/cudf/cudf/tests/test_json.py | 65 ++++++++++++++++++++++++++++- 2 files changed, 69 insertions(+), 5 deletions(-) diff --git a/python/cudf/cudf/_lib/json.pyx b/python/cudf/cudf/_lib/json.pyx index 64189a5626f..437c3ef6ec4 100644 --- a/python/cudf/cudf/_lib/json.pyx +++ b/python/cudf/cudf/_lib/json.pyx @@ -259,9 +259,10 @@ cdef _set_col_children_metadata(Column col, child_col, col_meta.children[i] ) elif is_list_dtype(col): - _set_col_children_metadata( - col.children[0], - col_meta.children[0] - ) + for i, child_col in enumerate(col.children): + col_meta.children.push_back(child_info) + _set_col_children_metadata( + child_col, col_meta.children[i] + ) else: return diff --git a/python/cudf/cudf/tests/test_json.py b/python/cudf/cudf/tests/test_json.py index 43b0ca0119a..47f5b99acf7 100644 --- a/python/cudf/cudf/tests/test_json.py +++ b/python/cudf/cudf/tests/test_json.py @@ -215,6 +215,69 @@ def test_cudf_json_writer_read(gdf_writer_types): assert_eq(pdf2, gdf2) +@pytest.mark.parametrize( + "jsonl_string, expected", + [ + # fixed width + ("""{"a":10, "b":1.1}\n {"a":20, "b":2.1}\n""", None), + # simple list + ("""{"a":[1, 2, 3], "b":1.1}\n {"a":[]}\n""", None), + # simple struct + ("""{"a":{"c": 123 }, "b":1.1}\n {"a": {"c": 456}}\n""", None), + # list of lists + ("""{"a":[[], [1, 2], [3, 4]], "b":1.1}\n""", None), + ("""{"a":[null, [1, 2], [null, 4]], "b":1.1}\n""", None), + # list of structs + # error ("""{"a":[null, {}], "b":1.1}\n""", None), + ( + """{"a":[null, {"L": 123}], "b":1.0}\n {"b":1.1}\n {"b":2.1}\n""", + None, + ), + ( + """{"a":[{"L": 123}, null], "b":1.0}\n {"b":1.1}\n {"b":2.1}\n""", + None, + ), + # struct of lists + ( + """{"a":{"L": [1, 2, 3]}, "b":1.1}\n {"a": {"L": [4, 5, 6]}}\n""", + None, + ), + ("""{"a":{"L": [1, 2, null]}, "b":1.1}\n {"a": {"L": []}}\n""", None), + # struct of structs + ( + """{"a":{"L": {"M": 123}}, "b":1.1} + {"a": {"L": {"M": 456}}}\n""", + None, + ), + ( + """{"a":{"L": {"M": null}}, "b":1.1}\n {"a": {"L": {}}}\n""", + """{"a":{"L": {}}, "b":1.1}\n {"a": {"L": {}}}\n""", + ), + # list of structs of lists + ("""{"a":[{"L": [1, 2, 3]}, {"L": [4, 5, 6]}], "b":1.1}\n""", None), + ("""{"a":[{"L": [1, 2, null]}, {"L": 
[]}], "b":1.1}\n""", None), + # struct of lists of structs + ("""{"a":{"L": [{"M": 123}, {"M": 456}]}, "b":1.1}\n""", None), + ( + """{"a":{"L": [{"M": null}, {}]}, "b":1.1}\n""", + """{"a":{"L": [{}, {}]}, "b":1.1}\n""", + ), + ], +) +def test_cudf_json_roundtrip(jsonl_string, expected): + gdf = cudf.read_json( + StringIO(jsonl_string), + lines=True, + engine="cudf", + # dtype=dict(dtypes), + ) + expected = jsonl_string if expected is None else expected + gdf_string = gdf.to_json( + orient="records", lines=True, engine="cudf", include_nulls=False + ) + assert_eq(gdf_string, expected.replace(" ", "")) + + @pytest.mark.parametrize("sink", ["string", "file"]) def test_cudf_json_writer_sinks(sink, tmp_path_factory): df = cudf.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}) @@ -1185,7 +1248,7 @@ def test_json_array_of_arrays(data, lines): # simple list with mixed types """{"a":[123, {}], "b":1.1}""", """{"a":[123, {"0": 123}], "b":1.0}\n {"b":1.1}\n {"b":2.1}""", - """{"a":[{"0": 123}, 123], "b":1.0}\n {"b":1.1}\n {"b":2.1}""", + """{"a":[{"L": 123}, 123], "b":1.0}\n {"b":1.1}\n {"b":2.1}""", """{"a":[123, {"0": 123}, 12.3], "b":1.0}\n {"b":1.1}\n {"b":2.1}""", """{"a":[123, {"0": 123}, null], "b":1.0}\n {"b":1.1}\n {"b":2.1}""", """{"a":["123", {"0": 123}], "b":1.0}\n {"b":1.1}\n {"b":2.1}""", From 4ea3a7540e5c75848a464444893a1dcf40941dbd Mon Sep 17 00:00:00 2001 From: GALI PREM SAGAR Date: Wed, 16 Aug 2023 20:41:27 -0500 Subject: [PATCH 066/230] Return `Timestamp` & `Timedelta` for fetching scalars in `DatetimeIndex` & `TimedeltaIndex` (#13896) This PR returns `pd.Timestamp` & `pd.Timdelta` respectively for `DatetimeIndex` & `TimedeltaIndex` `__getitem__` in pandas-compatibility mode. Authors: - GALI PREM SAGAR (https://github.com/galipremsagar) Approvers: - Bradley Dice (https://github.com/bdice) URL: https://github.com/rapidsai/cudf/pull/13896 --- python/cudf/cudf/core/index.py | 16 ++++++++++++++++ python/cudf/cudf/tests/test_index.py | 12 ++++++++++++ 2 files changed, 28 insertions(+) diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py index b7ee85758b9..458ef2df02d 100644 --- a/python/cudf/cudf/core/index.py +++ b/python/cudf/cudf/core/index.py @@ -2109,6 +2109,14 @@ def __init__( super().__init__(data, **kwargs) + def __getitem__(self, index): + value = super().__getitem__(index) + if cudf.get_option("mode.pandas_compatible") and isinstance( + value, np.datetime64 + ): + return pd.Timestamp(value) + return value + def searchsorted( self, value, @@ -2767,6 +2775,14 @@ def __init__( super().__init__(data, **kwargs) + def __getitem__(self, index): + value = super().__getitem__(index) + if cudf.get_option("mode.pandas_compatible") and isinstance( + value, np.timedelta64 + ): + return pd.Timedelta(value) + return value + @_cudf_nvtx_annotate def to_pandas(self, nullable=False): return pd.TimedeltaIndex( diff --git a/python/cudf/cudf/tests/test_index.py b/python/cudf/cudf/tests/test_index.py index aefaacdac29..2e6b45058ef 100644 --- a/python/cudf/cudf/tests/test_index.py +++ b/python/cudf/cudf/tests/test_index.py @@ -2682,3 +2682,15 @@ def test_index_date_duration_freq_error(cls): with cudf.option_context("mode.pandas_compatible", True): with pytest.raises(NotImplementedError): cudf.Index(s) + + +@pytest.mark.parametrize("dtype", ["datetime64[ns]", "timedelta64[ns]"]) +def test_index_getitem_time_duration(dtype): + gidx = cudf.Index([1, 2, 3, 4, None], dtype=dtype) + pidx = gidx.to_pandas() + with cudf.option_context("mode.pandas_compatible", True): + for i in 
range(len(gidx)): + if i == 4: + assert gidx[i] is pidx[i] + else: + assert_eq(gidx[i], pidx[i]) From 41f0caf53662cfde8146647574e705982eb558b1 Mon Sep 17 00:00:00 2001 From: Ed Seidl Date: Wed, 16 Aug 2023 21:13:15 -0700 Subject: [PATCH 067/230] Enable RLE boolean encoding for v2 Parquet files (#13886) While working on #13707 it was noticed that RLE encoding of booleans had been implemented and then disabled (see [this comment](https://github.com/rapidsai/cudf/pull/13707#discussion_r1275194215) for details). This PR re-enables RLE encoding for booleans, but only when V2 headers are being used. Part of #13501. Authors: - Ed Seidl (https://github.com/etseidl) Approvers: - Bradley Dice (https://github.com/bdice) - Vukasin Milovanovic (https://github.com/vuule) URL: https://github.com/rapidsai/cudf/pull/13886 --- cpp/src/io/parquet/page_enc.cu | 122 +++++++++++++++++++---------- cpp/src/io/parquet/parquet_gpu.hpp | 20 +++-- cpp/src/io/parquet/writer_impl.cu | 5 +- 3 files changed, 97 insertions(+), 50 deletions(-) diff --git a/cpp/src/io/parquet/page_enc.cu b/cpp/src/io/parquet/page_enc.cu index 9f4c0ba943a..20993d12af8 100644 --- a/cpp/src/io/parquet/page_enc.cu +++ b/cpp/src/io/parquet/page_enc.cu @@ -47,12 +47,6 @@ namespace parquet { namespace gpu { namespace { -// Spark doesn't support RLE encoding for BOOLEANs -#ifdef ENABLE_BOOL_RLE -constexpr bool enable_bool_rle = true; -#else -constexpr bool enable_bool_rle = false; -#endif using ::cudf::detail::device_2dspan; @@ -70,6 +64,9 @@ constexpr uint32_t WARP_MASK = cudf::detail::warp_size - 1; // currently 64k - 1 constexpr uint32_t MAX_GRID_Y_SIZE = (1 << 16) - 1; +// space needed for RLE length field +constexpr int RLE_LENGTH_FIELD_LEN = 4; + struct frag_init_state_s { parquet_column_device_view col; PageFragment frag; @@ -78,6 +75,7 @@ struct frag_init_state_s { struct page_enc_state_s { uint8_t* cur; //!< current output ptr uint8_t* rle_out; //!< current RLE write ptr + uint8_t* rle_len_pos; //!< position to write RLE length (for V2 boolean data) uint32_t rle_run; //!< current RLE run uint32_t run_val; //!< current RLE run value uint32_t rle_pos; //!< RLE encoder positions @@ -210,6 +208,27 @@ void __device__ calculate_frag_size(frag_init_state_s* const s, int t) } } +Encoding __device__ determine_encoding(PageType page_type, + Type physical_type, + bool use_dictionary, + bool write_v2_headers) +{ + // NOTE: For dictionary encoding, parquet v2 recommends using PLAIN in dictionary page and + // RLE_DICTIONARY in data page, but parquet v1 uses PLAIN_DICTIONARY in both dictionary and + // data pages (actual encoding is identical). + switch (page_type) { + case PageType::DATA_PAGE: return use_dictionary ? Encoding::PLAIN_DICTIONARY : Encoding::PLAIN; + case PageType::DATA_PAGE_V2: + // TODO need to work in delta encodings here when they're added + return physical_type == BOOLEAN ? Encoding::RLE + : use_dictionary ? Encoding::RLE_DICTIONARY + : Encoding::PLAIN; + case PageType::DICTIONARY_PAGE: + return write_v2_headers ? Encoding::PLAIN : Encoding::PLAIN_DICTIONARY; + default: CUDF_UNREACHABLE("unsupported page type"); + } +} + } // anonymous namespace // blockDim {512,1,1} @@ -384,6 +403,11 @@ __global__ void __launch_bounds__(128) num_pages = 1; } __syncwarp(); + + // page padding needed for RLE encoded boolean data + auto const rle_pad = + write_v2_headers && col_g.physical_type == BOOLEAN ? RLE_LENGTH_FIELD_LEN : 0; + // This loop goes over one page fragment at a time and adds it to page. 
// When page size crosses a particular limit, then it moves on to the next page and then next // page fragment gets added to that one. @@ -427,12 +451,12 @@ __global__ void __launch_bounds__(128) // override this_max_page_size if the requested size is smaller this_max_page_size = min(this_max_page_size, max_page_size_bytes); - // subtract size of rep and def level vectors - auto num_vals = values_in_page + frag_g.num_values; - this_max_page_size = - underflow_safe_subtract(this_max_page_size, - max_RLE_page_size(col_g.num_def_level_bits(), num_vals) + - max_RLE_page_size(col_g.num_rep_level_bits(), num_vals)); + // subtract size of rep and def level vectors and RLE length field + auto num_vals = values_in_page + frag_g.num_values; + this_max_page_size = underflow_safe_subtract( + this_max_page_size, + max_RLE_page_size(col_g.num_def_level_bits(), num_vals) + + max_RLE_page_size(col_g.num_rep_level_bits(), num_vals) + rle_pad); // checks to see when we need to close the current page and start a new one auto const is_last_chunk = num_rows >= ck_g.num_rows; @@ -474,7 +498,7 @@ __global__ void __launch_bounds__(128) page_g.num_values = values_in_page; auto const def_level_size = max_RLE_page_size(col_g.num_def_level_bits(), values_in_page); auto const rep_level_size = max_RLE_page_size(col_g.num_rep_level_bits(), values_in_page); - auto const max_data_size = page_size + def_level_size + rep_level_size; + auto const max_data_size = page_size + def_level_size + rep_level_size + rle_pad; // page size must fit in 32-bit signed integer if (max_data_size > std::numeric_limits::max()) { CUDF_UNREACHABLE("page size exceeds maximum for i32"); @@ -967,7 +991,8 @@ __global__ void __launch_bounds__(128, 8) gpuEncodePages(device_span pages, device_span> comp_in, device_span> comp_out, - device_span comp_results) + device_span comp_results, + bool write_v2_headers) { __shared__ __align__(8) page_enc_state_s state_g; using block_reduce = cub::BlockReduce; @@ -990,6 +1015,7 @@ __global__ void __launch_bounds__(128, 8) s->page.def_lvl_bytes = 0; s->page.rep_lvl_bytes = 0; s->page.num_nulls = 0; + s->rle_len_pos = nullptr; } __syncthreads(); @@ -1132,9 +1158,15 @@ __global__ void __launch_bounds__(128, 8) s->rle_pos = 0; s->rle_numvals = 0; s->rle_out = dst; + s->page.encoding = + determine_encoding(s->page.page_type, physical_type, s->ck.use_dictionary, write_v2_headers); if (dict_bits >= 0 && physical_type != BOOLEAN) { dst[0] = dict_bits; s->rle_out = dst + 1; + } else if (is_v2 && physical_type == BOOLEAN) { + // save space for RLE length. we don't know the total length yet. 
+ s->rle_out = dst + RLE_LENGTH_FIELD_LEN; + s->rle_len_pos = dst; } s->page_start_val = row_to_value_idx(s->page.start_row, s->col); s->chunk_start_val = row_to_value_idx(s->ck.start_row, s->col); @@ -1188,7 +1220,7 @@ __global__ void __launch_bounds__(128, 8) } rle_numvals += rle_numvals_in_block; __syncthreads(); - if ((!enable_bool_rle) && (physical_type == BOOLEAN)) { + if (!is_v2 && physical_type == BOOLEAN) { PlainBoolEncode(s, rle_numvals, (cur_val_idx == s->page.num_leaf_values), t); } else { RleEncode(s, rle_numvals, dict_bits, (cur_val_idx == s->page.num_leaf_values), t); @@ -1345,22 +1377,29 @@ __global__ void __launch_bounds__(128, 8) uint32_t const valid_count = block_reduce(temp_storage.reduce_storage).Sum(num_valid); + // save RLE length if necessary + if (s->rle_len_pos != nullptr && t < 32) { + // size doesn't include the 4 bytes for the length + auto const rle_size = static_cast(s->cur - s->rle_len_pos) - RLE_LENGTH_FIELD_LEN; + if (t < RLE_LENGTH_FIELD_LEN) { s->rle_len_pos[t] = rle_size >> (t * 8); } + __syncwarp(); + } + + // V2 does not compress rep and def level data + size_t const skip_comp_size = s->page.def_lvl_bytes + s->page.rep_lvl_bytes; + if (t == 0) { - s->page.num_nulls = s->page.num_values - valid_count; - uint8_t* base = s->page.page_data + s->page.max_hdr_size; - auto actual_data_size = static_cast(s->cur - base); + s->page.num_nulls = s->page.num_values - valid_count; + uint8_t* const base = s->page.page_data + s->page.max_hdr_size; + auto const actual_data_size = static_cast(s->cur - base); if (actual_data_size > s->page.max_data_size) { CUDF_UNREACHABLE("detected possible page data corruption"); } s->page.max_data_size = actual_data_size; if (not comp_in.empty()) { - // V2 does not compress rep and def level data - size_t const skip_comp_size = s->page.def_lvl_bytes + s->page.rep_lvl_bytes; - comp_in[blockIdx.x] = {base + skip_comp_size, actual_data_size - skip_comp_size}; + comp_in[blockIdx.x] = {base + skip_comp_size, actual_data_size - skip_comp_size}; comp_out[blockIdx.x] = {s->page.compressed_data + s->page.max_hdr_size + skip_comp_size, 0}; // size is unused - // copy uncompressed bytes over - memcpy(s->page.compressed_data + s->page.max_hdr_size, base, skip_comp_size); } pages[blockIdx.x] = s->page; if (not comp_results.empty()) { @@ -1368,6 +1407,15 @@ __global__ void __launch_bounds__(128, 8) pages[blockIdx.x].comp_res = &comp_results[blockIdx.x]; } } + + // copy over uncompressed data + if (skip_comp_size != 0 && not comp_in.empty()) { + uint8_t const* const src = s->page.page_data + s->page.max_hdr_size; + uint8_t* const dst = s->page.compressed_data + s->page.max_hdr_size; + for (int i = t; i < skip_comp_size; i += block_size) { + dst[i] = src[i]; + } + } } constexpr int decide_compression_warps_in_block = 4; @@ -1865,28 +1913,16 @@ __global__ void __launch_bounds__(128) } header_encoder encoder(hdr_start); PageType page_type = page_g.page_type; - // NOTE: For dictionary encoding, parquet v2 recommends using PLAIN in dictionary page and - // RLE_DICTIONARY in data page, but parquet v1 uses PLAIN_DICTIONARY in both dictionary and - // data pages (actual encoding is identical). - Encoding encoding; - if (enable_bool_rle) { - encoding = (col_g.physical_type == BOOLEAN) ? Encoding::RLE - : (page_type == PageType::DICTIONARY_PAGE || page_g.chunk->use_dictionary) - ? Encoding::PLAIN_DICTIONARY - : Encoding::PLAIN; - } else { - encoding = (page_type == PageType::DICTIONARY_PAGE || page_g.chunk->use_dictionary) - ? 
Encoding::PLAIN_DICTIONARY - : Encoding::PLAIN; - } + encoder.field_int32(1, page_type); encoder.field_int32(2, uncompressed_page_size); encoder.field_int32(3, compressed_page_size); + if (page_type == PageType::DATA_PAGE) { // DataPageHeader encoder.field_struct_begin(5); encoder.field_int32(1, page_g.num_values); // NOTE: num_values != num_rows for list types - encoder.field_int32(2, encoding); // encoding + encoder.field_int32(2, page_g.encoding); // encoding encoder.field_int32(3, Encoding::RLE); // definition_level_encoding encoder.field_int32(4, Encoding::RLE); // repetition_level_encoding // Optionally encode page-level statistics @@ -1898,11 +1934,12 @@ __global__ void __launch_bounds__(128) } encoder.field_struct_end(5); } else if (page_type == PageType::DATA_PAGE_V2) { + // DataPageHeaderV2 encoder.field_struct_begin(8); encoder.field_int32(1, page_g.num_values); encoder.field_int32(2, page_g.num_nulls); encoder.field_int32(3, page_g.num_rows); - encoder.field_int32(4, encoding); + encoder.field_int32(4, page_g.encoding); encoder.field_int32(5, page_g.def_lvl_bytes); encoder.field_int32(6, page_g.rep_lvl_bytes); encoder.field_bool(7, ck_g.is_compressed); // TODO can compress at page level now @@ -1918,7 +1955,7 @@ __global__ void __launch_bounds__(128) // DictionaryPageHeader encoder.field_struct_begin(7); encoder.field_int32(1, ck_g.num_dict_entries); // number of values in dictionary - encoder.field_int32(2, encoding); + encoder.field_int32(2, page_g.encoding); encoder.field_struct_end(7); } encoder.end(&hdr_end, false); @@ -2237,6 +2274,7 @@ void InitEncoderPages(device_2dspan chunks, } void EncodePages(device_span pages, + bool write_v2_headers, device_span> comp_in, device_span> comp_out, device_span comp_results, @@ -2245,8 +2283,8 @@ void EncodePages(device_span pages, auto num_pages = pages.size(); // A page is part of one column. This is launching 1 block per page. 1 block will exclusively // deal with one datatype. - gpuEncodePages<128> - <<>>(pages, comp_in, comp_out, comp_results); + gpuEncodePages<128><<>>( + pages, comp_in, comp_out, comp_results, write_v2_headers); } void DecideCompression(device_span chunks, rmm::cuda_stream_view stream) diff --git a/cpp/src/io/parquet/parquet_gpu.hpp b/cpp/src/io/parquet/parquet_gpu.hpp index b3c9d0765fa..fc4ad026b61 100644 --- a/cpp/src/io/parquet/parquet_gpu.hpp +++ b/cpp/src/io/parquet/parquet_gpu.hpp @@ -411,6 +411,7 @@ struct EncPage { uint8_t* compressed_data; //!< Ptr to compressed page uint16_t num_fragments; //!< Number of fragments in page PageType page_type; //!< Page type + Encoding encoding; //!< Encoding used for page data EncColumnChunk* chunk; //!< Chunk that this page belongs to uint32_t chunk_id; //!< Index in chunk array uint32_t hdr_size; //!< Size of page header @@ -672,13 +673,18 @@ void InitEncoderPages(cudf::detail::device_2dspan chunks, /** * @brief Launches kernel for packing column data into parquet pages * + * If compression is to be used, `comp_in`, `comp_out`, and `comp_res` will be initialized for + * use in subsequent compression operations. 
+ * * @param[in,out] pages Device array of EncPages (unordered) + * @param[in] write_v2_headers True if V2 page headers should be written * @param[out] comp_in Compressor input buffers - * @param[out] comp_in Compressor output buffers - * @param[out] comp_stats Compressor results - * @param[in] stream CUDA stream to use, default 0 + * @param[out] comp_out Compressor output buffers + * @param[out] comp_res Compressor results + * @param[in] stream CUDA stream to use */ void EncodePages(device_span pages, + bool write_v2_headers, device_span> comp_in, device_span> comp_out, device_span comp_res, @@ -688,7 +694,7 @@ void EncodePages(device_span pages, * @brief Launches kernel to make the compressed vs uncompressed chunk-level decision * * @param[in,out] chunks Column chunks (updated with actual compressed/uncompressed sizes) - * @param[in] stream CUDA stream to use, default 0 + * @param[in] stream CUDA stream to use */ void DecideCompression(device_span chunks, rmm::cuda_stream_view stream); @@ -696,10 +702,10 @@ void DecideCompression(device_span chunks, rmm::cuda_stream_view * @brief Launches kernel to encode page headers * * @param[in,out] pages Device array of EncPages - * @param[in] comp_stats Compressor status + * @param[in] comp_res Compressor status * @param[in] page_stats Optional page-level statistics to be included in page header * @param[in] chunk_stats Optional chunk-level statistics to be encoded - * @param[in] stream CUDA stream to use, default 0 + * @param[in] stream CUDA stream to use */ void EncodePageHeaders(device_span pages, device_span comp_res, @@ -712,7 +718,7 @@ void EncodePageHeaders(device_span pages, * * @param[in,out] chunks Column chunks * @param[in] pages Device array of EncPages - * @param[in] stream CUDA stream to use, default 0 + * @param[in] stream CUDA stream to use */ void GatherPages(device_span chunks, device_span pages, diff --git a/cpp/src/io/parquet/writer_impl.cu b/cpp/src/io/parquet/writer_impl.cu index 06e7b6bfc8a..021b6cffa5a 100644 --- a/cpp/src/io/parquet/writer_impl.cu +++ b/cpp/src/io/parquet/writer_impl.cu @@ -1266,6 +1266,7 @@ void init_encoder_pages(hostdevice_2dvector& chunks, * @param comp_stats optional compression statistics (nullopt if none) * @param compression compression format * @param column_index_truncate_length maximum length of min or max values in column index, in bytes + * @param write_v2_headers True if V2 page headers should be written * @param stream CUDA stream used for device memory operations and kernel launches */ void encode_pages(hostdevice_2dvector& chunks, @@ -1280,6 +1281,7 @@ void encode_pages(hostdevice_2dvector& chunks, std::optional& comp_stats, Compression compression, int32_t column_index_truncate_length, + bool write_v2_headers, rmm::cuda_stream_view stream) { auto batch_pages = pages.subspan(first_page_in_batch, pages_in_batch); @@ -1300,7 +1302,7 @@ void encode_pages(hostdevice_2dvector& chunks, comp_res.end(), compression_result{0, compression_status::FAILURE}); - gpu::EncodePages(batch_pages, comp_in, comp_out, comp_res, stream); + gpu::EncodePages(batch_pages, write_v2_headers, comp_in, comp_out, comp_res, stream); switch (compression) { case parquet::Compression::SNAPPY: if (nvcomp::is_compression_disabled(nvcomp::compression_type::SNAPPY)) { @@ -1926,6 +1928,7 @@ auto convert_table_to_parquet_data(table_input_metadata& table_meta, comp_stats, compression, column_index_truncate_length, + write_v2_headers, stream); bool need_sync{false}; From f543dfa1356f02ae6b581e3e2584fffccfc69c76 Mon Sep 17 00:00:00 
2001 From: David Wendt <45795991+davidwendt@users.noreply.github.com> Date: Thu, 17 Aug 2023 11:19:19 -0400 Subject: [PATCH 068/230] Fix Byte-Pair-Encoding usage of cuco static-map for storing merge-pairs (#13807) Switching to use `cuco::experimental::static_map` for storing the unique merge-pair strings that can be looked up by `string_view`. This takes advantage of a feature of the `static_map` that allows storing with one key (index to a string entry) and lookup with a different type (string). The map uses a hash on the string for storing the index but allows lookup by string since the hash of string can resolve the entry and duplicates can be resolved by comparing the string with row entries. Authors: - David Wendt (https://github.com/davidwendt) - Yunsong Wang (https://github.com/PointKernel) Approvers: - Yunsong Wang (https://github.com/PointKernel) - Bradley Dice (https://github.com/bdice) URL: https://github.com/rapidsai/cudf/pull/13807 --- cpp/include/nvtext/bpe_tokenize.hpp | 15 +---- cpp/src/text/subword/bpe_tokenizer.cu | 43 ++++++--------- cpp/src/text/subword/bpe_tokenizer.cuh | 70 +++++++++++++++++++++--- cpp/src/text/subword/load_merges_file.cu | 48 ++++++---------- 4 files changed, 98 insertions(+), 78 deletions(-) diff --git a/cpp/include/nvtext/bpe_tokenize.hpp b/cpp/include/nvtext/bpe_tokenize.hpp index b93d93b07c6..c67f4bd8b1c 100644 --- a/cpp/include/nvtext/bpe_tokenize.hpp +++ b/cpp/include/nvtext/bpe_tokenize.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022, NVIDIA CORPORATION. + * Copyright (c) 2022-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -61,19 +61,6 @@ struct bpe_merge_pairs { rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); ~bpe_merge_pairs(); - - /** - * @brief Returns the number of merge pairs in the table. - * - * @return The number of merge pairs in the table - */ - cudf::size_type get_size(); - /** - * @brief Returns the number of unique merge pairs in the table. - * - * @return The number of unique merge pairs in the table - */ - std::size_t get_map_size(); }; /** diff --git a/cpp/src/text/subword/bpe_tokenizer.cu b/cpp/src/text/subword/bpe_tokenizer.cu index ac55fe76db1..4c4f5b3a4b1 100644 --- a/cpp/src/text/subword/bpe_tokenizer.cu +++ b/cpp/src/text/subword/bpe_tokenizer.cu @@ -80,10 +80,11 @@ __device__ cudf::string_view get_first_token(cudf::string_view const& d_str) * * @see The byte_pair_encoding_fn::operator() function below for details. */ +template struct byte_pair_encoding_fn { cudf::column_device_view const d_merges; cudf::column_device_view const d_strings; - merge_pairs_map_type::device_view const d_map; + MapRefType const d_map; cudf::size_type* d_sizes; // output size of encoded string string_hasher_type const hasher; cudf::size_type* d_byte_indices; @@ -136,17 +137,13 @@ struct byte_pair_encoding_fn { } /** - * @brief Compute the hash over the input strings. + * @brief Look up the pair of strings in the d_map/d_merges * - * The input strings are combined with a space to produce hash for matching - * a merge pair within the `d_map`. - * - * @param lhs First string. - * @param rhs Second string. - * @return The hash value to match with `d_map`. 
+ * @param lhs Left half of the string + * @param rhs Right half of the string + * @return Position of merge pair within d_map */ - __device__ cudf::hash_value_type compute_hash(cudf::string_view const& lhs, - cudf::string_view const& rhs) + __device__ auto get_merge_pair(cudf::string_view const& lhs, cudf::string_view const& rhs) { __shared__ char shmem[48 * 1024]; // max for Pascal auto const total_size = lhs.size_bytes() + rhs.size_bytes() + 1; @@ -154,8 +151,8 @@ struct byte_pair_encoding_fn { // Edge case check. // Empirically found only two merge pair strings that were greater than 70 bytes - // and they both looked like ignorable errors. Double check this analysis with Vibhu. - if (thread_memory_size < total_size) { return 0; } + // and they both looked like ignorable errors. + if (thread_memory_size < total_size) { return d_map.end(); } // build the target string in shared memory char* ptr = &shmem[threadIdx.x * thread_memory_size]; @@ -165,8 +162,8 @@ struct byte_pair_encoding_fn { memcpy(ptr + lhs.size_bytes(), " ", 1); memcpy(ptr + lhs.size_bytes() + 1, rhs.data(), rhs.size_bytes()); - auto const d_hash_str = cudf::string_view(ptr, total_size); - return hasher(d_hash_str); // return the hash for the temp string + auto const d_str = cudf::string_view(ptr, total_size); + return d_map.find(d_str); } /** @@ -233,11 +230,10 @@ struct byte_pair_encoding_fn { auto const rhs = next_substr(itr, end, d_str); if (rhs.empty()) break; // no more adjacent pairs - auto const hash = compute_hash(lhs, rhs); - auto const map_itr = d_map.find(hash, thrust::identity{}); + auto const map_itr = get_merge_pair(lhs, rhs); if (map_itr != d_map.end()) { // found a match; record the rank (and other min_ vars) - auto const rank = static_cast(map_itr->second); + auto const rank = map_itr->second; if (rank < min_rank) { min_rank = rank; min_itr = itr; @@ -354,12 +350,12 @@ std::unique_ptr byte_pair_encoding( bpe_merge_pairs::bpe_merge_pairs_impl const& merge_pairs, rmm::cuda_stream_view stream) { - CUDF_EXPECTS(!merge_pairs.get_merge_pairs().is_empty(), "Merge pairs table must not be empty"); + auto const d_merges = merge_pairs.get_merge_pairs(); + CUDF_EXPECTS(d_merges.size() > 0, "Merge pairs table must not be empty"); // build working vector to hold index values per byte rmm::device_uvector d_byte_indices(input.chars().size(), stream); - auto const d_merges = cudf::column_device_view::create(merge_pairs.get_merge_pairs(), stream); auto const d_strings = cudf::column_device_view::create(input.parent(), stream); auto offsets = cudf::make_numeric_column(cudf::data_type{cudf::type_to_id()}, @@ -369,12 +365,9 @@ std::unique_ptr byte_pair_encoding( rmm::mr::get_current_device_resource()); auto d_offsets = offsets->mutable_view().data(); - byte_pair_encoding_fn fn{*d_merges, - *d_strings, - merge_pairs.get_merge_pairs_map(), - d_offsets, - string_hasher_type{}, - d_byte_indices.data()}; + auto map_ref = merge_pairs.get_merge_pairs_ref(); + byte_pair_encoding_fn fn{ + d_merges, *d_strings, map_ref, d_offsets, string_hasher_type{}, d_byte_indices.data()}; thrust::for_each_n( rmm::exec_policy(stream), thrust::make_counting_iterator(0), input.size(), fn); diff --git a/cpp/src/text/subword/bpe_tokenizer.cuh b/cpp/src/text/subword/bpe_tokenizer.cuh index 0697a9961c7..83aa22aaae9 100644 --- a/cpp/src/text/subword/bpe_tokenizer.cuh +++ b/cpp/src/text/subword/bpe_tokenizer.cuh @@ -21,7 +21,9 @@ #include #include +#include #include +#include #include #include @@ -30,30 +32,84 @@ #include #include +#include namespace 
nvtext { namespace detail { +using hash_value_type = uint32_t; +using string_hasher_type = cudf::hashing::detail::MurmurHash3_x86_32; + +/** + * @brief Hasher function used for building and using the cuco static-map + * + * This takes advantage of heterogeneous lookup feature in cuco static-map which + * allows inserting with one type (index) and looking up with a different type (string). + */ +struct bpe_hasher { + cudf::column_device_view const d_strings; + string_hasher_type hasher{}; + // used by insert + __device__ hash_value_type operator()(cudf::size_type index) const + { + return hasher(d_strings.element(index)); + } + // used by find + __device__ hash_value_type operator()(cudf::string_view const& s) const { return hasher(s); } +}; + +/** + * @brief Equal function used for building and using the cuco static-map + * + * This takes advantage of heterogeneous lookup feature in cuco static-map which + * allows inserting with one type (index) and looking up with a different type (string). + */ +struct bpe_equal { + cudf::column_device_view const d_strings; + // used by insert + __device__ bool operator()(cudf::size_type lhs, cudf::size_type rhs) const noexcept + { + return d_strings.element(lhs) == d_strings.element(rhs); + } + // used by find + __device__ bool operator()(cudf::size_type lhs, cudf::string_view const& rhs) const noexcept + { + return d_strings.element(lhs) == rhs; + } +}; + using hash_table_allocator_type = rmm::mr::stream_allocator_adaptor>; -using merge_pairs_map_type = cuco::static_map; +using probe_scheme = cuco::experimental::linear_probing<1, bpe_hasher>; -using string_hasher_type = cudf::hashing::detail::MurmurHash3_x86_32; +using merge_pairs_map_type = cuco::experimental::static_map, + cuda::thread_scope_device, + bpe_equal, + probe_scheme, + hash_table_allocator_type>; } // namespace detail +// since column_device_view::create returns is a little more than +// std::unique_ptr this helper simplifies the return type in a more maintainable +// way +using col_device_view = std::invoke_result_t; + struct bpe_merge_pairs::bpe_merge_pairs_impl { std::unique_ptr const merge_pairs; + col_device_view const d_merge_pairs; std::unique_ptr merge_pairs_map; bpe_merge_pairs_impl(std::unique_ptr&& merge_pairs, + col_device_view&& d_merge_pairs, std::unique_ptr&& merge_pairs_map); - auto get_merge_pairs() const { return merge_pairs->view(); } - auto get_merge_pairs_map() const { return merge_pairs_map->get_device_view(); } + auto const get_merge_pairs() const { return *d_merge_pairs; } + auto get_merge_pairs_ref() const { return merge_pairs_map->ref(cuco::experimental::op::find); } }; } // namespace nvtext diff --git a/cpp/src/text/subword/load_merges_file.cu b/cpp/src/text/subword/load_merges_file.cu index b39413af98f..1f1b90b3f49 100644 --- a/cpp/src/text/subword/load_merges_file.cu +++ b/cpp/src/text/subword/load_merges_file.cu @@ -36,23 +36,8 @@ namespace nvtext { namespace detail { - namespace { -struct make_pair_function { - /** - * @brief Hash the merge pair entry - */ - __device__ cuco::pair operator()(cudf::size_type idx) - { - auto const result = _hasher(d_strings.element(idx)); - return cuco::make_pair(result, idx); - } - - string_hasher_type const _hasher; - cudf::column_device_view const d_strings; -}; - /** * @brief Loads a text file of merge-pairs into a strings column. 
* @@ -101,26 +86,23 @@ std::unique_ptr load_file_to_column(std::string const& filename_me } std::unique_ptr initialize_merge_pairs_map( - cudf::strings_column_view const& input, rmm::cuda_stream_view stream) + cudf::column_device_view const& input, rmm::cuda_stream_view stream) { // Ensure capacity is at least (size/0.7) as documented here: // https://github.com/NVIDIA/cuCollections/blob/6ec8b6dcdeceea07ab4456d32461a05c18864411/include/cuco/static_map.cuh#L179-L182 auto merge_pairs_map = std::make_unique( static_cast(input.size() * 2), // capacity is 2x; - cuco::empty_key{std::numeric_limits::max()}, + cuco::empty_key{-1}, cuco::empty_value{-1}, // empty value is not used + bpe_equal{input}, + probe_scheme{bpe_hasher{input}}, hash_table_allocator_type{default_allocator{}, stream}, stream.value()); - auto d_strings = cudf::column_device_view::create(input.parent(), stream); - make_pair_function pair_func{string_hasher_type{}, *d_strings}; - auto iter = cudf::detail::make_counting_transform_iterator(0, pair_func); + auto iter = cudf::detail::make_counting_transform_iterator( + 0, [] __device__(cudf::size_type idx) { return cuco::make_pair(idx, idx); }); - merge_pairs_map->insert(iter, - iter + input.size(), - thrust::identity{}, - thrust::equal_to{}, - stream.value()); + merge_pairs_map->insert_async(iter, iter + input.size(), stream.value()); return merge_pairs_map; } @@ -128,9 +110,10 @@ std::unique_ptr initialize_merge_pairs_map( std::unique_ptr create_bpe_merge_pairs_impl( std::unique_ptr&& input, rmm::cuda_stream_view stream) { - auto merge_pairs = initialize_merge_pairs_map(cudf::strings_column_view(input->view()), stream); - return std::make_unique(std::move(input), - std::move(merge_pairs)); + auto d_input = cudf::column_device_view::create(input->view(), stream); + auto merge_pairs = initialize_merge_pairs_map(*d_input, stream); + return std::make_unique( + std::move(input), std::move(d_input), std::move(merge_pairs)); } std::unique_ptr create_bpe_merge_pairs_impl( @@ -163,8 +146,12 @@ std::unique_ptr load_merge_pairs_file(std::string const& filena bpe_merge_pairs::bpe_merge_pairs_impl::bpe_merge_pairs_impl( std::unique_ptr&& merge_pairs, + std::unique_ptr>&& + d_merge_pairs, std::unique_ptr&& merge_pairs_map) - : merge_pairs(std::move(merge_pairs)), merge_pairs_map(std::move(merge_pairs_map)) + : merge_pairs(std::move(merge_pairs)), + d_merge_pairs(std::move(d_merge_pairs)), + merge_pairs_map(std::move(merge_pairs_map)) { } @@ -184,7 +171,4 @@ bpe_merge_pairs::bpe_merge_pairs(cudf::strings_column_view const& input, bpe_merge_pairs::~bpe_merge_pairs() = default; -cudf::size_type bpe_merge_pairs::get_size() { return impl->merge_pairs->size(); } -std::size_t bpe_merge_pairs::get_map_size() { return impl->merge_pairs_map->get_size(); } - } // namespace nvtext From 42cab26368c7f575097b36cc2ed8360aeeca5cbe Mon Sep 17 00:00:00 2001 From: Bradley Dice Date: Thu, 17 Aug 2023 15:40:41 -0500 Subject: [PATCH 069/230] Use cuda-nvtx-dev CUDA 12 package. (#13901) Fixes the CUDA 12 `cuda-nvtx-dev` package name. This issue was found by @davidwendt and I verified that this fixes it. 
Authors: - Bradley Dice (https://github.com/bdice) Approvers: - Vukasin Milovanovic (https://github.com/vuule) - Ray Douglass (https://github.com/raydouglass) - https://github.com/jakirkham URL: https://github.com/rapidsai/cudf/pull/13901 --- .../all_cuda-118_arch-x86_64.yaml | 1 + .../all_cuda-120_arch-x86_64.yaml | 1 + dependencies.yaml | 21 +++++-------------- 3 files changed, 7 insertions(+), 16 deletions(-) diff --git a/conda/environments/all_cuda-118_arch-x86_64.yaml b/conda/environments/all_cuda-118_arch-x86_64.yaml index dca0e3fe901..e9f97a63db7 100644 --- a/conda/environments/all_cuda-118_arch-x86_64.yaml +++ b/conda/environments/all_cuda-118_arch-x86_64.yaml @@ -16,6 +16,7 @@ dependencies: - cachetools - cmake>=3.26.4 - cubinlinker +- cuda-nvtx=11.8 - cuda-python>=11.7.1,<12.0a0 - cuda-sanitizer-api=11.8.86 - cuda-version=11.8 diff --git a/conda/environments/all_cuda-120_arch-x86_64.yaml b/conda/environments/all_cuda-120_arch-x86_64.yaml index a9d0b260aee..dd85db528a6 100644 --- a/conda/environments/all_cuda-120_arch-x86_64.yaml +++ b/conda/environments/all_cuda-120_arch-x86_64.yaml @@ -18,6 +18,7 @@ dependencies: - cuda-cudart-dev - cuda-nvcc - cuda-nvrtc-dev +- cuda-nvtx-dev - cuda-python>=12.0,<13.0a0 - cuda-sanitizer-api - cuda-version=12.0 diff --git a/dependencies.yaml b/dependencies.yaml index 0da5dbbb5ad..79db1f16947 100644 --- a/dependencies.yaml +++ b/dependencies.yaml @@ -289,12 +289,14 @@ dependencies: - cuda-version=12.0 - cuda-cudart-dev - cuda-nvrtc-dev + - cuda-nvtx-dev - libcurand-dev - matrix: cuda: "11.8" packages: - cuda-version=11.8 - cudatoolkit + - cuda-nvtx=11.8 - libcurand-dev=10.3.0.86 - libcurand=10.3.0.86 - matrix: @@ -302,6 +304,7 @@ dependencies: packages: - cuda-version=11.5 - cudatoolkit + - cuda-nvtx=11.5 # Can't hard pin the version since 11.x is missing many # packages for specific versions - libcurand-dev>=10.2.6.48,<=10.2.7.107 @@ -311,6 +314,7 @@ dependencies: packages: - cuda-version=11.4 - cudatoolkit + - &cudanvtx114 cuda-nvtx=11.4 - &libcurand_dev114 libcurand-dev>=10.2.5.43,<=10.2.5.120 - &libcurand114 libcurand>=10.2.5.43,<=10.2.5.120 - matrix: @@ -321,6 +325,7 @@ dependencies: # The NVIDIA channel doesn't publish pkgs older than 11.4 for # these libs, so 11.2 uses 11.4 packages (the oldest # available). + - *cudanvtx114 - *libcurand_dev114 - *libcurand114 - output_types: conda @@ -535,22 +540,6 @@ dependencies: - *cmake_ver - maven - openjdk=8.* - specific: - - output_types: conda - matrices: - - matrix: - cuda: "12.0" - packages: - - cuda-version=12.0 - - cuda-nvtx - - matrix: - cuda: "11.8" - packages: - - cuda-nvtx=11.8 - - matrix: - cuda: "11.5" - packages: - - cuda-nvtx=11.5 test_python_common: common: - output_types: [conda, requirements, pyproject] From 0cc8a5459fd0abaf1ab03fc773674887539213f6 Mon Sep 17 00:00:00 2001 From: Vukasin Milovanovic Date: Thu, 17 Aug 2023 17:28:26 -0700 Subject: [PATCH 070/230] Correctly detect the BOM mark in `read_csv` with compressed input (#13881) We currently detect the BOM mark at the start of the file and then decompress the data, if compressed. This leads to a problem with compressed files that have a BOM mark, as we are trying to detect the mark in the compressed data. This PR moves the BOM detection until after decompression. Also cleaned up the surrounding code slightly. 
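For context, a minimal standalone sketch of the BOM-skip step that this change applies to the decompressed bytes; the helper name and signature are illustrative only, not the reader's actual API:

```cpp
#include <cstddef>
#include <cstdint>
#include <cstring>
#include <utility>

// Skip a leading UTF-8 BOM if present. The key point of the fix is that this
// check must run on the decompressed data, not the raw (possibly gzip) bytes.
// `skip_utf8_bom` is a stand-in helper, not a cudf function.
std::pair<uint8_t const*, std::size_t> skip_utf8_bom(uint8_t const* data, std::size_t size)
{
  constexpr uint8_t bom[] = {0xEF, 0xBB, 0xBF};
  if (size >= sizeof(bom) && std::memcmp(data, bom, sizeof(bom)) == 0) {
    return {data + sizeof(bom), size - sizeof(bom)};
  }
  return {data, size};
}
```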
Authors: - Vukasin Milovanovic (https://github.com/vuule) Approvers: - David Wendt (https://github.com/davidwendt) - Vyas Ramasubramani (https://github.com/vyasr) URL: https://github.com/rapidsai/cudf/pull/13881 --- cpp/src/io/csv/reader_impl.cu | 27 +++++++++++++-------------- python/cudf/cudf/tests/test_csv.py | 12 ++++++++++++ 2 files changed, 25 insertions(+), 14 deletions(-) diff --git a/cpp/src/io/csv/reader_impl.cu b/cpp/src/io/csv/reader_impl.cu index 16667e4394d..478cf2957bc 100644 --- a/cpp/src/io/csv/reader_impl.cu +++ b/cpp/src/io/csv/reader_impl.cu @@ -410,28 +410,27 @@ std::pair, selected_rows_offsets> select_data_and_row_ // Transfer source data to GPU if (!source->is_empty()) { - auto data_size = (range_size_padded != 0) ? range_size_padded : source->size(); - auto buffer = source->host_read(range_offset, data_size); - - // check for and skip UTF-8 BOM - auto buffer_data = buffer->data(); - auto buffer_size = buffer->size(); - uint8_t const UTF8_BOM[] = {0xEF, 0xBB, 0xBF}; - if (buffer_size > sizeof(UTF8_BOM) && memcmp(buffer_data, UTF8_BOM, sizeof(UTF8_BOM)) == 0) { - buffer_data += sizeof(UTF8_BOM); - buffer_size -= sizeof(UTF8_BOM); - } - - auto h_data = host_span(reinterpret_cast(buffer_data), buffer_size); + auto buffer = + source->host_read(range_offset, range_size_padded != 0 ? range_size_padded : source->size()); + auto h_data = + host_span(reinterpret_cast(buffer->data()), buffer->size()); std::vector h_uncomp_data_owner; - if (reader_opts.get_compression() != compression_type::NONE) { h_uncomp_data_owner = decompress(reader_opts.get_compression(), {buffer->data(), buffer->size()}); h_data = {reinterpret_cast(h_uncomp_data_owner.data()), h_uncomp_data_owner.size()}; + buffer.reset(); } + + // check for and skip UTF-8 BOM + uint8_t const UTF8_BOM[] = {0xEF, 0xBB, 0xBF}; + if (h_data.size() >= sizeof(UTF8_BOM) && + memcmp(h_data.data(), UTF8_BOM, sizeof(UTF8_BOM)) == 0) { + h_data = h_data.subspan(sizeof(UTF8_BOM), h_data.size() - sizeof(UTF8_BOM)); + } + // None of the parameters for row selection is used, we are parsing the entire file bool const load_whole_file = range_offset == 0 && range_size == 0 && skip_rows <= 0 && skip_end_rows <= 0 && num_rows == -1; diff --git a/python/cudf/cudf/tests/test_csv.py b/python/cudf/cudf/tests/test_csv.py index b34340295f2..ff82ca802aa 100644 --- a/python/cudf/cudf/tests/test_csv.py +++ b/python/cudf/cudf/tests/test_csv.py @@ -1,5 +1,6 @@ # Copyright (c) 2018-2023, NVIDIA CORPORATION. +import codecs import gzip import os import re @@ -2222,3 +2223,14 @@ def test_column_selection_plus_column_names(usecols, names): pd.read_csv(StringIO(buffer), usecols=usecols, names=names), cudf.read_csv(StringIO(buffer), usecols=usecols, names=names), ) + + +def test_read_compressed_BOM(tmpdir): + buffer = 'int, string\n1, "a"\n2, "b"\n3, "c"\n' + + fname = tmpdir.mkdir("gdf_csv").join("tmp_csvreader_file20.gz") + with gzip.open(fname, "wt", encoding="utf-8") as f: + f.write(codecs.BOM_UTF8.decode("utf-8")) + f.write(buffer) + + assert_eq(pd.read_csv(fname), cudf.read_csv(fname)) From fb0277e2adf7550cbedd8941df23cd1c24f57888 Mon Sep 17 00:00:00 2001 From: Alessandro Bellina Date: Thu, 17 Aug 2023 20:15:23 -0500 Subject: [PATCH 071/230] [JNI] Adds HostColumnVector.EventHandler for spillability checks (#13898) This change adds the exact same API for event handling that was added to the device-side `ColumnVector` here https://github.com/rapidsai/cudf/pull/13279. 
We are going to need this to make `HostColumnVector`, or a batch of them, spillable in this spark-rapids feature: https://github.com/NVIDIA/spark-rapids/issues/8882. Authors: - Alessandro Bellina (https://github.com/abellina) Approvers: - Gera Shegalov (https://github.com/gerashegalov) - Robert (Bobby) Evans (https://github.com/revans2) URL: https://github.com/rapidsai/cudf/pull/13898 --- .../java/ai/rapids/cudf/ColumnVector.java | 4 +- .../java/ai/rapids/cudf/HostColumnVector.java | 45 ++++++++++++++++++- .../java/ai/rapids/cudf/ColumnVectorTest.java | 29 ++++++++++++ 3 files changed, 75 insertions(+), 3 deletions(-) diff --git a/java/src/main/java/ai/rapids/cudf/ColumnVector.java b/java/src/main/java/ai/rapids/cudf/ColumnVector.java index 0595d58c7cc..30e92d2367f 100644 --- a/java/src/main/java/ai/rapids/cudf/ColumnVector.java +++ b/java/src/main/java/ai/rapids/cudf/ColumnVector.java @@ -50,8 +50,8 @@ public interface EventHandler { * * @note the callback is invoked with this `ColumnVector`'s lock held. * - * @param cv - a reference to the ColumnVector we are closing - * @param refCount - the updated ref count for this ColumnVector at the time + * @param cv reference to the ColumnVector we are closing + * @param refCount the updated ref count for this ColumnVector at the time * of invocation */ void onClosed(ColumnVector cv, int refCount); diff --git a/java/src/main/java/ai/rapids/cudf/HostColumnVector.java b/java/src/main/java/ai/rapids/cudf/HostColumnVector.java index 6cb7767784a..7993989825d 100644 --- a/java/src/main/java/ai/rapids/cudf/HostColumnVector.java +++ b/java/src/main/java/ai/rapids/cudf/HostColumnVector.java @@ -1,6 +1,6 @@ /* * - * Copyright (c) 2020-2022, NVIDIA CORPORATION. + * Copyright (c) 2020-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -39,12 +39,31 @@ * and call incRefCount to increment the reference count. */ public final class HostColumnVector extends HostColumnVectorCore { + /** + * Interface to handle events for this HostColumnVector. Only invoked during + * close, hence `onClosed` is the only event. + */ + public interface EventHandler { + /** + * `onClosed` is invoked with the updated `refCount` during `close`. + * The last invocation of `onClosed` will be with `refCount=0`. + * + * @note the callback is invoked with this `HostColumnVector`'s lock held. + * + * @param cv reference to the HostColumnVector we are closing + * @param refCount the updated ref count for this HostColumnVector at + * the time of invocation + */ + void onClosed(HostColumnVector cv, int refCount); + } + /** * The size in bytes of an offset entry */ static final int OFFSET_SIZE = DType.INT32.getSizeInBytes(); private int refCount; + private EventHandler eventHandler; /** * Create a new column vector with data populated on the host. @@ -93,6 +112,27 @@ public HostColumnVector(DType type, long rows, Optional nullCount, incRefCountInternal(true); } + /** + * Set an event handler for this host vector. This method can be invoked with + * null to unset the handler. + * + * @param newHandler - the EventHandler to use from this point forward + * @return the prior event handler, or null if not set. 
+ */ + public synchronized EventHandler setEventHandler(EventHandler newHandler) { + EventHandler prev = this.eventHandler; + this.eventHandler = newHandler; + return prev; + } + + /** + * Returns the current event handler for this HostColumnVector or null if no + * handler is associated. + */ + public synchronized EventHandler getEventHandler() { + return this.eventHandler; + } + /** * This is a really ugly API, but it is possible that the lifecycle of a column of * data may not have a clear lifecycle thanks to java and GC. This API informs the leak @@ -110,6 +150,9 @@ public void noWarnLeakExpected() { public synchronized void close() { refCount--; offHeap.delRef(); + if (eventHandler != null) { + eventHandler.onClosed(this, refCount); + } if (refCount == 0) { offHeap.clean(false); for( HostColumnVectorCore child : children) { diff --git a/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java b/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java index b462d70ccd2..0e1fbad6129 100644 --- a/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java +++ b/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java @@ -6791,6 +6791,18 @@ public void testEventHandlerIsCalledForEachClose() { assertEquals(1, onClosedWasCalled.get()); } + @Test + public void testHostEventHandlerIsCalledForEachClose() { + final AtomicInteger onClosedWasCalled = new AtomicInteger(0); + try (HostColumnVector cv = HostColumnVector.fromInts(1,2,3,4)) { + cv.setEventHandler((col, refCount) -> { + assertEquals(cv, col); + onClosedWasCalled.incrementAndGet(); + }); + } + assertEquals(1, onClosedWasCalled.get()); + } + @Test public void testEventHandlerIsNotCalledIfNotSet() { final AtomicInteger onClosedWasCalled = new AtomicInteger(0); @@ -6808,6 +6820,23 @@ public void testEventHandlerIsNotCalledIfNotSet() { assertEquals(0, onClosedWasCalled.get()); } + @Test + public void testHostEventHandlerIsNotCalledIfNotSet() { + final AtomicInteger onClosedWasCalled = new AtomicInteger(0); + try (HostColumnVector cv = HostColumnVector.fromInts(1,2,3,4)) { + assertNull(cv.getEventHandler()); + } + assertEquals(0, onClosedWasCalled.get()); + + try (HostColumnVector cv = HostColumnVector.fromInts(1,2,3,4)) { + cv.setEventHandler((col, refCount) -> { + onClosedWasCalled.incrementAndGet(); + }); + cv.setEventHandler(null); + } + assertEquals(0, onClosedWasCalled.get()); + } + /** * Test that the ColumnView with unknown null-counts still returns * the correct null-count when queried. From 28b5b6e59e49343496e441ea45a843b4dd8d1bf3 Mon Sep 17 00:00:00 2001 From: Jason Lowe Date: Thu, 17 Aug 2023 20:53:55 -0500 Subject: [PATCH 072/230] Add noSanitizer tag to Java reduction tests failing with sanitizer in CUDA 12 (#13904) Relates to NVIDIA/spark-rapids-jni#1349. The Java ReductionTest unit tests are failing when run under CUDA 12's compute-sanitizer but pass when run with the CUDA 11 version. To unblock CI, marking the affected tests to be run without the sanitizer in the interim while this is being investigated. 
Authors: - Jason Lowe (https://github.com/jlowe) Approvers: - Nghia Truong (https://github.com/ttnghia) - Gera Shegalov (https://github.com/gerashegalov) URL: https://github.com/rapidsai/cudf/pull/13904 --- java/src/test/java/ai/rapids/cudf/ReductionTest.java | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/java/src/test/java/ai/rapids/cudf/ReductionTest.java b/java/src/test/java/ai/rapids/cudf/ReductionTest.java index cc172204ed3..8cc7df1ce7f 100644 --- a/java/src/test/java/ai/rapids/cudf/ReductionTest.java +++ b/java/src/test/java/ai/rapids/cudf/ReductionTest.java @@ -18,6 +18,7 @@ package ai.rapids.cudf; import com.google.common.collect.Lists; +import org.junit.jupiter.api.Tag; import org.junit.jupiter.api.Test; import org.junit.jupiter.params.ParameterizedTest; import org.junit.jupiter.params.provider.Arguments; @@ -430,6 +431,7 @@ private static void assertEqualsDelta(ReductionAggregation op, Scalar expected, } } + @Tag("noSanitizer") @ParameterizedTest @MethodSource("createBooleanParams") void testBoolean(ReductionAggregation op, Boolean[] values, @@ -441,6 +443,7 @@ void testBoolean(ReductionAggregation op, Boolean[] values, } } + @Tag("noSanitizer") @ParameterizedTest @MethodSource("createByteParams") void testByte(ReductionAggregation op, Byte[] values, @@ -452,6 +455,7 @@ void testByte(ReductionAggregation op, Byte[] values, } } + @Tag("noSanitizer") @ParameterizedTest @MethodSource("createShortParams") void testShort(ReductionAggregation op, Short[] values, @@ -474,6 +478,7 @@ void testInt(ReductionAggregation op, Integer[] values, } } + @Tag("noSanitizer") @ParameterizedTest @MethodSource("createLongParams") void testLong(ReductionAggregation op, Long[] values, @@ -496,6 +501,7 @@ void testFloat(ReductionAggregation op, Float[] values, } } + @Tag("noSanitizer") @ParameterizedTest @MethodSource("createDoubleParams") void testDouble(ReductionAggregation op, Double[] values, @@ -507,6 +513,7 @@ void testDouble(ReductionAggregation op, Double[] values, } } + @Tag("noSanitizer") @ParameterizedTest @MethodSource("createTimestampDaysParams") void testTimestampDays(ReductionAggregation op, Integer[] values, @@ -518,6 +525,7 @@ void testTimestampDays(ReductionAggregation op, Integer[] values, } } + @Tag("noSanitizer") @ParameterizedTest @MethodSource("createTimestampSecondsParams") void testTimestampSeconds(ReductionAggregation op, Long[] values, @@ -529,6 +537,7 @@ void testTimestampSeconds(ReductionAggregation op, Long[] values, } } + @Tag("noSanitizer") @ParameterizedTest @MethodSource("createTimestampMilliSecondsParams") void testTimestampMilliseconds(ReductionAggregation op, Long[] values, @@ -540,6 +549,7 @@ void testTimestampMilliseconds(ReductionAggregation op, Long[] values, } } + @Tag("noSanitizer") @ParameterizedTest @MethodSource("createTimestampMicroSecondsParams") void testTimestampMicroseconds(ReductionAggregation op, Long[] values, @@ -551,6 +561,7 @@ void testTimestampMicroseconds(ReductionAggregation op, Long[] values, } } + @Tag("noSanitizer") @ParameterizedTest @MethodSource("createTimestampNanoSecondsParams") void testTimestampNanoseconds(ReductionAggregation op, Long[] values, From f2334228d3df079e5e771b5114ac114331c5a775 Mon Sep 17 00:00:00 2001 From: Vukasin Milovanovic Date: Fri, 18 Aug 2023 10:44:34 -0700 Subject: [PATCH 073/230] Use `empty()` instead of `size()` where possible (#13908) Adds `empty()` to `hostdevice_vector`. Check `empty()` instead of checking the column/buffer/vector `size()`. 
Authors: - Vukasin Milovanovic (https://github.com/vuule) Approvers: - Karthikeyan (https://github.com/karthikeyann) - David Wendt (https://github.com/davidwendt) URL: https://github.com/rapidsai/cudf/pull/13908 --- cpp/src/io/avro/reader_impl.cu | 2 +- cpp/src/io/csv/reader_impl.cu | 4 ++-- cpp/src/io/json/json_column.cu | 2 +- cpp/src/io/json/legacy/reader_impl.cu | 6 +++--- cpp/src/io/json/nested_json_gpu.cu | 4 ++-- cpp/src/io/orc/reader_impl.cu | 6 +++--- cpp/src/io/orc/writer_impl.cu | 2 +- cpp/src/io/parquet/compact_protocol_writer.cpp | 16 ++++++++-------- cpp/src/io/parquet/reader_impl.cpp | 4 ++-- cpp/src/io/parquet/writer_impl.cu | 16 ++++++++-------- cpp/src/io/utilities/column_buffer.cpp | 2 +- cpp/src/io/utilities/hostdevice_vector.hpp | 1 + 12 files changed, 33 insertions(+), 32 deletions(-) diff --git a/cpp/src/io/avro/reader_impl.cu b/cpp/src/io/avro/reader_impl.cu index c25010c4e5f..f73e1db91c3 100644 --- a/cpp/src/io/avro/reader_impl.cu +++ b/cpp/src/io/avro/reader_impl.cu @@ -499,7 +499,7 @@ table_with_metadata read_avro(std::unique_ptr&& source, // Select only columns required by the options auto selected_columns = meta.select_columns(options.get_columns()); - if (selected_columns.size() != 0) { + if (not selected_columns.empty()) { // Get a list of column data types std::vector column_types; for (auto const& col : selected_columns) { diff --git a/cpp/src/io/csv/reader_impl.cu b/cpp/src/io/csv/reader_impl.cu index 478cf2957bc..dc28380bf24 100644 --- a/cpp/src/io/csv/reader_impl.cu +++ b/cpp/src/io/csv/reader_impl.cu @@ -972,14 +972,14 @@ parse_options make_parse_options(csv_reader_options const& reader_opts, // Handle user-defined true values, whereby field data is substituted with a // boolean true or numeric `1` value - if (reader_opts.get_true_values().size() != 0) { + if (not reader_opts.get_true_values().empty()) { parse_opts.trie_true = cudf::detail::create_serialized_trie(reader_opts.get_true_values(), stream); } // Handle user-defined false values, whereby field data is substituted with a // boolean false or numeric `0` value - if (reader_opts.get_false_values().size() != 0) { + if (not reader_opts.get_false_values().empty()) { parse_opts.trie_false = cudf::detail::create_serialized_trie(reader_opts.get_false_values(), stream); } diff --git a/cpp/src/io/json/json_column.cu b/cpp/src/io/json/json_column.cu index 0cd8edaf78c..487a4bc4068 100644 --- a/cpp/src/io/json/json_column.cu +++ b/cpp/src/io/json/json_column.cu @@ -958,7 +958,7 @@ table_with_metadata device_parse_nested_json(device_span d_input, options.is_enabled_lines() ? root_column : root_column.child_columns.begin()->second; // Zero row entries - if (data_root.type == json_col_t::ListColumn && data_root.child_columns.size() == 0) { + if (data_root.type == json_col_t::ListColumn && data_root.child_columns.empty()) { return table_with_metadata{std::make_unique
(std::vector>{})}; } diff --git a/cpp/src/io/json/legacy/reader_impl.cu b/cpp/src/io/json/legacy/reader_impl.cu index c524c041df7..1ae7ccf71c1 100644 --- a/cpp/src/io/json/legacy/reader_impl.cu +++ b/cpp/src/io/json/legacy/reader_impl.cu @@ -438,7 +438,7 @@ std::vector get_data_types(json_reader_options const& reader_opts, }}, reader_opts.get_dtypes()); } else { - CUDF_EXPECTS(rec_starts.size() != 0, "No data available for data type inference.\n"); + CUDF_EXPECTS(not rec_starts.empty(), "No data available for data type inference.\n"); auto const num_columns = column_names.size(); auto const do_set_null_count = column_map->capacity() > 0; @@ -612,7 +612,7 @@ table_with_metadata read_json(host_span> sources, sources, reader_opts.get_compression(), range_offset, range_size, range_size_padded); host_span h_data{reinterpret_cast(h_raw_data.data()), h_raw_data.size()}; - CUDF_EXPECTS(h_data.size() != 0, "Ingest failed: uncompressed input data has zero size.\n"); + CUDF_EXPECTS(not h_data.empty(), "Ingest failed: uncompressed input data has zero size.\n"); auto d_data = rmm::device_uvector(0, stream); @@ -629,7 +629,7 @@ table_with_metadata read_json(host_span> sources, d_data = upload_data_to_device(reader_opts, h_data, rec_starts, stream); } - CUDF_EXPECTS(d_data.size() != 0, "Error uploading input data to the GPU.\n"); + CUDF_EXPECTS(not d_data.is_empty(), "Error uploading input data to the GPU.\n"); auto column_names_and_map = get_column_names_and_map(parse_opts.view(), h_data, rec_starts, d_data, stream); diff --git a/cpp/src/io/json/nested_json_gpu.cu b/cpp/src/io/json/nested_json_gpu.cu index 8552db9a719..9a08b5f9353 100644 --- a/cpp/src/io/json/nested_json_gpu.cu +++ b/cpp/src/io/json/nested_json_gpu.cu @@ -1647,7 +1647,7 @@ void make_json_column(json_column& root_column, CUDF_EXPECTS(current_data_path.top().column->child_columns.size() <= 1, "Encountered a list column with more than a single child column"); // The child column has yet to be created - if (current_data_path.top().column->child_columns.size() == 0) { + if (current_data_path.top().column->child_columns.empty()) { current_data_path.top().column->child_columns.emplace(std::string{list_child_name}, json_column{json_col_t::Unknown}); current_data_path.top().column->column_order.push_back(list_child_name); @@ -2119,7 +2119,7 @@ table_with_metadata host_parse_nested_json(device_span d_input, new_line_delimited_json ? root_column : root_column.child_columns.begin()->second; // Zero row entries - if (data_root.type == json_col_t::ListColumn && data_root.child_columns.size() == 0) { + if (data_root.type == json_col_t::ListColumn && data_root.child_columns.empty()) { return table_with_metadata{std::make_unique
(std::vector>{})}; } diff --git a/cpp/src/io/orc/reader_impl.cu b/cpp/src/io/orc/reader_impl.cu index 0f1b72d6126..157269cf52e 100644 --- a/cpp/src/io/orc/reader_impl.cu +++ b/cpp/src/io/orc/reader_impl.cu @@ -149,7 +149,7 @@ std::size_t gather_stream_info(std::size_t stripe_index, // for each of its fields. There is only a PRESENT stream, which // needs to be included for the reader. auto const schema_type = types[column_id]; - if (schema_type.subtypes.size() != 0) { + if (not schema_type.subtypes.empty()) { if (schema_type.kind == orc::STRUCT && stream.kind == orc::PRESENT) { for (auto const& idx : schema_type.subtypes) { auto child_idx = (idx < orc2gdf.size()) ? orc2gdf[idx] : -1; @@ -249,7 +249,7 @@ rmm::device_buffer decompress_stripe_data( // Required by `gpuDecodeOrcColumnData`. rmm::device_buffer decomp_data( cudf::util::round_up_safe(total_decomp_size, BUFFER_PADDING_MULTIPLE), stream); - if (decomp_data.size() == 0) { return decomp_data; } + if (decomp_data.is_empty()) { return decomp_data; } rmm::device_uvector> inflate_in( num_compressed_blocks + num_uncompressed_blocks, stream); @@ -1232,7 +1232,7 @@ table_with_metadata reader::impl::read(uint64_t skip_rows, CUDF_EXPECTS(task.first.get() == task.second, "Unexpected discrepancy in bytes read."); } - if (stripe_data.size() == 0) { continue; } + if (stripe_data.empty()) { continue; } // Process dataset chunk pages into output columns auto row_groups = diff --git a/cpp/src/io/orc/writer_impl.cu b/cpp/src/io/orc/writer_impl.cu index 881fc3b5caf..6a3c5f0134d 100644 --- a/cpp/src/io/orc/writer_impl.cu +++ b/cpp/src/io/orc/writer_impl.cu @@ -2625,7 +2625,7 @@ void writer::impl::close() }); // Write statistics metadata - if (_orc_meta.stripeStats.size() != 0) { + if (not _orc_meta.stripeStats.empty()) { ProtobufWriter pbw((_compression_kind != NONE) ? 
3 : 0); pbw.write(_orc_meta); add_uncompressed_block_headers(_compression_kind, _compression_blocksize, pbw.buffer()); diff --git a/cpp/src/io/parquet/compact_protocol_writer.cpp b/cpp/src/io/parquet/compact_protocol_writer.cpp index 8aaff77d0dd..b2a89129645 100644 --- a/cpp/src/io/parquet/compact_protocol_writer.cpp +++ b/cpp/src/io/parquet/compact_protocol_writer.cpp @@ -31,8 +31,8 @@ size_t CompactProtocolWriter::write(FileMetaData const& f) c.field_struct_list(2, f.schema); c.field_int(3, f.num_rows); c.field_struct_list(4, f.row_groups); - if (f.key_value_metadata.size() != 0) { c.field_struct_list(5, f.key_value_metadata); } - if (f.created_by.size() != 0) { c.field_string(6, f.created_by); } + if (not f.key_value_metadata.empty()) { c.field_struct_list(5, f.key_value_metadata); } + if (not f.created_by.empty()) { c.field_string(6, f.created_by); } if (f.column_order_listsize != 0) { // Dummy list of struct containing an empty field1 struct c.put_field_header(7, c.current_field(), ST_FLD_LIST); @@ -167,14 +167,14 @@ size_t CompactProtocolWriter::write(KeyValue const& k) { CompactProtocolFieldWriter c(*this); c.field_string(1, k.key); - if (k.value.size() != 0) { c.field_string(2, k.value); } + if (not k.value.empty()) { c.field_string(2, k.value); } return c.value(); } size_t CompactProtocolWriter::write(ColumnChunk const& s) { CompactProtocolFieldWriter c(*this); - if (s.file_path.size() != 0) { c.field_string(1, s.file_path); } + if (not s.file_path.empty()) { c.field_string(1, s.file_path); } c.field_int(2, s.file_offset); c.field_struct(3, s.meta_data); if (s.offset_index_length != 0) { @@ -208,12 +208,12 @@ size_t CompactProtocolWriter::write(ColumnChunkMetaData const& s) size_t CompactProtocolWriter::write(Statistics const& s) { CompactProtocolFieldWriter c(*this); - if (s.max.size() != 0) { c.field_binary(1, s.max); } - if (s.min.size() != 0) { c.field_binary(2, s.min); } + if (not s.max.empty()) { c.field_binary(1, s.max); } + if (not s.min.empty()) { c.field_binary(2, s.min); } if (s.null_count != -1) { c.field_int(3, s.null_count); } if (s.distinct_count != -1) { c.field_int(4, s.distinct_count); } - if (s.max_value.size() != 0) { c.field_binary(5, s.max_value); } - if (s.min_value.size() != 0) { c.field_binary(6, s.min_value); } + if (not s.max_value.empty()) { c.field_binary(5, s.max_value); } + if (not s.min_value.empty()) { c.field_binary(6, s.min_value); } return c.value(); } diff --git a/cpp/src/io/parquet/reader_impl.cpp b/cpp/src/io/parquet/reader_impl.cpp index 88e520c99a4..5a44eb6baa0 100644 --- a/cpp/src/io/parquet/reader_impl.cpp +++ b/cpp/src/io/parquet/reader_impl.cpp @@ -328,7 +328,7 @@ void reader::impl::prepare_data(int64_t skip_rows, auto const [skip_rows_corrected, num_rows_corrected, row_groups_info] = _metadata->select_row_groups(row_group_indices, skip_rows, num_rows, output_types, filter); - if (num_rows_corrected > 0 && row_groups_info.size() != 0 && _input_columns.size() != 0) { + if (num_rows_corrected > 0 && not row_groups_info.empty() && not _input_columns.empty()) { load_and_decompress_data(row_groups_info, num_rows_corrected); preprocess_pages( skip_rows_corrected, num_rows_corrected, uses_custom_row_bounds, _chunk_read_limit); @@ -368,7 +368,7 @@ table_with_metadata reader::impl::read_chunk_internal( auto out_columns = std::vector>{}; out_columns.reserve(_output_buffers.size()); - if (!has_next() || _chunk_read_info.size() == 0) { + if (!has_next() || _chunk_read_info.empty()) { return finalize_output(out_metadata, out_columns, filter); } diff 
--git a/cpp/src/io/parquet/writer_impl.cu b/cpp/src/io/parquet/writer_impl.cu index 021b6cffa5a..c5fc852d20b 100644 --- a/cpp/src/io/parquet/writer_impl.cu +++ b/cpp/src/io/parquet/writer_impl.cu @@ -202,7 +202,7 @@ parquet::Compression to_parquet_compression(compression_type compression) */ size_t column_size(column_view const& column, rmm::cuda_stream_view stream) { - if (column.size() == 0) { return 0; } + if (column.is_empty()) { return 0; } if (is_fixed_width(column.type())) { return size_of(column.type()) * column.size(); @@ -573,7 +573,7 @@ std::vector construct_schema_tree( CUDF_EXPECTS(col_meta.num_children() == 2 or col_meta.num_children() == 0, "Binary column's corresponding metadata should have zero or two children!"); if (col_meta.num_children() > 0) { - CUDF_EXPECTS(col->children[lists_column_view::child_column_index]->children.size() == 0, + CUDF_EXPECTS(col->children[lists_column_view::child_column_index]->children.empty(), "Binary column must not be nested!"); } @@ -859,7 +859,7 @@ parquet_column_view::parquet_column_view(schema_tree_node const& schema_node, _is_list = (_max_rep_level > 0); - if (cudf_col.size() == 0) { return; } + if (cudf_col.is_empty()) { return; } if (_is_list) { // Top level column's offsets are not applied to all children. Get the effective offset and @@ -1103,7 +1103,7 @@ build_chunk_dictionaries(hostdevice_2dvector& chunks, std::vector> dict_data; std::vector> dict_index; - if (h_chunks.size() == 0) { return std::pair(std::move(dict_data), std::move(dict_index)); } + if (h_chunks.empty()) { return std::pair(std::move(dict_data), std::move(dict_index)); } if (dict_policy == dictionary_policy::NEVER) { thrust::for_each( @@ -2369,11 +2369,11 @@ std::unique_ptr> writer::merge_row_group_metadata( } } // Reader doesn't currently populate column_order, so infer it here - if (md.row_groups.size() != 0) { + if (not md.row_groups.empty()) { auto const is_valid_stats = [](auto const& stats) { - return stats.max.size() != 0 || stats.min.size() != 0 || stats.null_count != -1 || - stats.distinct_count != -1 || stats.max_value.size() != 0 || - stats.min_value.size() != 0; + return not stats.max.empty() || not stats.min.empty() || stats.null_count != -1 || + stats.distinct_count != -1 || not stats.max_value.empty() || + not stats.min_value.empty(); }; uint32_t num_columns = static_cast(md.row_groups[0].columns.size()); diff --git a/cpp/src/io/utilities/column_buffer.cpp b/cpp/src/io/utilities/column_buffer.cpp index 9b8754d6318..3248d94d60a 100644 --- a/cpp/src/io/utilities/column_buffer.cpp +++ b/cpp/src/io/utilities/column_buffer.cpp @@ -68,7 +68,7 @@ std::unique_ptr inline_column_buffer::make_string_column_impl(rmm::cuda_ // no need for copies, just transfer ownership of the data_buffers to the columns auto const state = mask_state::UNALLOCATED; auto str_col = - _string_data.size() == 0 + _string_data.is_empty() ? 
make_empty_column(data_type{type_id::INT8}) : std::make_unique(data_type{type_id::INT8}, string_size(), diff --git a/cpp/src/io/utilities/hostdevice_vector.hpp b/cpp/src/io/utilities/hostdevice_vector.hpp index c8655cdcc7c..a6a93c41472 100644 --- a/cpp/src/io/utilities/hostdevice_vector.hpp +++ b/cpp/src/io/utilities/hostdevice_vector.hpp @@ -94,6 +94,7 @@ class hostdevice_vector { [[nodiscard]] size_t capacity() const noexcept { return d_data.size(); } [[nodiscard]] size_t size() const noexcept { return current_size; } [[nodiscard]] size_t size_bytes() const noexcept { return sizeof(T) * size(); } + [[nodiscard]] bool empty() const noexcept { return size() == 0; } [[nodiscard]] T& operator[](size_t i) { return host_data[i]; } [[nodiscard]] T const& operator[](size_t i) const { return host_data[i]; } From b798a70d608cbbe2c7f372a8c21354455ba56f74 Mon Sep 17 00:00:00 2001 From: Vukasin Milovanovic Date: Fri, 18 Aug 2023 11:08:24 -0700 Subject: [PATCH 074/230] Remove the internal use of the cudf's default stream in cuIO (#13903) `cudf::get_default_stream()` is used as the default stream when calling `detail` APIs. From this point, the stream passed to this `detail` implementation should be propagated and used consistently. The exceptions are the parts of the implementation that use a separate stream pool. All in all, libcudf code should not use `cudf::get_default_stream()` outside of dispatching the detail APIs. This PR removes most of such uses in cuIO. One notable exception is `datasource::host_read`, as it simply does not have a stream parameter (and adding it would be a breaking change). Also removed comments that mention default stream value that no longer exists. Authors: - Vukasin Milovanovic (https://github.com/vuule) Approvers: - Bradley Dice (https://github.com/bdice) - Nghia Truong (https://github.com/ttnghia) URL: https://github.com/rapidsai/cudf/pull/13903 --- cpp/src/io/avro/avro_gpu.cu | 2 +- cpp/src/io/csv/csv_gpu.hpp | 4 ++-- cpp/src/io/fst/logical_stack.cuh | 2 +- cpp/src/io/json/json_column.cu | 12 ++++++------ cpp/src/io/json/nested_json_gpu.cu | 14 +++++++++----- cpp/src/io/parquet/parquet_gpu.hpp | 14 +++++++------- cpp/src/io/parquet/predicate_pushdown.cpp | 6 +++--- cpp/src/io/parquet/reader_impl.cpp | 3 ++- cpp/src/io/parquet/reader_impl_helpers.cpp | 5 +++-- cpp/src/io/parquet/reader_impl_helpers.hpp | 9 ++++++--- cpp/src/io/parquet/reader_impl_preprocess.cu | 2 +- 11 files changed, 41 insertions(+), 32 deletions(-) diff --git a/cpp/src/io/avro/avro_gpu.cu b/cpp/src/io/avro/avro_gpu.cu index 3a663fca041..2c634d9b590 100644 --- a/cpp/src/io/avro/avro_gpu.cu +++ b/cpp/src/io/avro/avro_gpu.cu @@ -419,7 +419,7 @@ __global__ void __launch_bounds__(num_warps * 32, 2) * @param[in] avro_data Raw block data * @param[in] schema_len Number of entries in schema * @param[in] min_row_size Minimum size in bytes of a row - * @param[in] stream CUDA stream to use, default 0 + * @param[in] stream CUDA stream to use */ void DecodeAvroColumnData(device_span blocks, schemadesc_s* schema, diff --git a/cpp/src/io/csv/csv_gpu.hpp b/cpp/src/io/csv/csv_gpu.hpp index 4b51368f14d..62bd8f1eff2 100644 --- a/cpp/src/io/csv/csv_gpu.hpp +++ b/cpp/src/io/csv/csv_gpu.hpp @@ -195,7 +195,7 @@ device_span remove_blank_rows(cudf::io::parse_options_view const& opti * @param[in] data The row-column data * @param[in] column_flags Flags that control individual column parsing * @param[in] row_offsets List of row data start positions (offsets) - * @param[in] stream CUDA stream to use, default 0 + * @param[in] 
stream CUDA stream to use * * @return stats Histogram of each dtypes' occurrence for each column */ @@ -218,7 +218,7 @@ std::vector detect_column_types( * @param[out] columns Device memory output of column data * @param[out] valids Device memory output of column valids bitmap data * @param[out] valid_counts Device memory output of the number of valid fields in each column - * @param[in] stream CUDA stream to use, default 0 + * @param[in] stream CUDA stream to use */ void decode_row_column_data(cudf::io::parse_options_view const& options, device_span data, diff --git a/cpp/src/io/fst/logical_stack.cuh b/cpp/src/io/fst/logical_stack.cuh index a5d32cba125..c4f99736306 100644 --- a/cpp/src/io/fst/logical_stack.cuh +++ b/cpp/src/io/fst/logical_stack.cuh @@ -274,7 +274,7 @@ void sparse_stack_op_to_top_of_stack(StackSymbolItT d_symbols, StackSymbolT const empty_stack_symbol, StackSymbolT const read_symbol, std::size_t const num_symbols_out, - rmm::cuda_stream_view stream = cudf::get_default_stream()) + rmm::cuda_stream_view stream) { rmm::device_buffer temp_storage{}; diff --git a/cpp/src/io/json/json_column.cu b/cpp/src/io/json/json_column.cu index 487a4bc4068..bdad16bd9f1 100644 --- a/cpp/src/io/json/json_column.cu +++ b/cpp/src/io/json/json_column.cu @@ -355,16 +355,15 @@ std::vector copy_strings_to_host(device_span input, options_view, stream, rmm::mr::get_current_device_resource()); - auto to_host = [](auto const& col) { + auto to_host = [stream](auto const& col) { if (col.is_empty()) return std::vector{}; auto const scv = cudf::strings_column_view(col); auto const h_chars = cudf::detail::make_std_vector_sync( - cudf::device_span(scv.chars().data(), scv.chars().size()), - cudf::get_default_stream()); + cudf::device_span(scv.chars().data(), scv.chars().size()), stream); auto const h_offsets = cudf::detail::make_std_vector_sync( cudf::device_span(scv.offsets().data() + scv.offset(), scv.size() + 1), - cudf::get_default_stream()); + stream); // build std::string vector from chars and offsets std::vector host_data; @@ -719,7 +718,8 @@ void make_device_json_column(device_span input, * @param options The reader options to influence the relevant type inference and type casting * options */ -cudf::io::parse_options parsing_options(cudf::io::json_reader_options const& options); +cudf::io::parse_options parsing_options(cudf::io::json_reader_options const& options, + rmm::cuda_stream_view stream); std::pair, std::vector> device_json_column_to_cudf_column( device_json_column& json_col, @@ -976,7 +976,7 @@ table_with_metadata device_parse_nested_json(device_span d_input, // Initialize meta data to be populated while recursing through the tree of columns std::vector> out_columns; std::vector out_column_names; - auto parse_opt = parsing_options(options); + auto parse_opt = parsing_options(options, stream); // Iterate over the struct's child columns and convert to cudf column size_type column_index = 0; diff --git a/cpp/src/io/json/nested_json_gpu.cu b/cpp/src/io/json/nested_json_gpu.cu index 9a08b5f9353..b691eaa8caf 100644 --- a/cpp/src/io/json/nested_json_gpu.cu +++ b/cpp/src/io/json/nested_json_gpu.cu @@ -1889,12 +1889,12 @@ void make_json_column(json_column& root_column, * * @param options The reader options to influence the relevant type inference and type casting * options + * @param stream The CUDA stream to which kernels are dispatched */ -auto parsing_options(cudf::io::json_reader_options const& options) +auto parsing_options(cudf::io::json_reader_options const& options, rmm::cuda_stream_view 
stream) { auto parse_opts = cudf::io::parse_options{',', '\n', '\"', '.'}; - auto const stream = cudf::get_default_stream(); parse_opts.dayfirst = options.is_enabled_dayfirst(); parse_opts.keepquotes = options.is_enabled_keep_quotes(); parse_opts.trie_true = cudf::detail::create_serialized_trie({"true"}, stream); @@ -1975,8 +1975,12 @@ std::pair, std::vector> json_column_to } // Infer column type, if we don't have an explicit type for it else { - target_type = cudf::io::detail::infer_data_type( - parsing_options(options).json_view(), d_input, string_ranges_it, col_size, stream); + target_type = + cudf::io::detail::infer_data_type(parsing_options(options, stream).json_view(), + d_input, + string_ranges_it, + col_size, + stream); } auto [result_bitmask, null_count] = make_validity(json_col); @@ -1987,7 +1991,7 @@ std::pair, std::vector> json_column_to target_type, std::move(result_bitmask), null_count, - parsing_options(options).view(), + parsing_options(options, stream).view(), stream, mr); diff --git a/cpp/src/io/parquet/parquet_gpu.hpp b/cpp/src/io/parquet/parquet_gpu.hpp index fc4ad026b61..b7a8f4e2157 100644 --- a/cpp/src/io/parquet/parquet_gpu.hpp +++ b/cpp/src/io/parquet/parquet_gpu.hpp @@ -442,7 +442,7 @@ constexpr bool is_string_col(ColumnChunkDesc const& chunk) * * @param[in] chunks List of column chunks * @param[in] num_chunks Number of column chunks - * @param[in] stream CUDA stream to use, default 0 + * @param[in] stream CUDA stream to use */ void DecodePageHeaders(ColumnChunkDesc* chunks, int32_t num_chunks, rmm::cuda_stream_view stream); @@ -452,7 +452,7 @@ void DecodePageHeaders(ColumnChunkDesc* chunks, int32_t num_chunks, rmm::cuda_st * * @param[in] chunks List of column chunks * @param[in] num_chunks Number of column chunks - * @param[in] stream CUDA stream to use, default 0 + * @param[in] stream CUDA stream to use */ void BuildStringDictionaryIndex(ColumnChunkDesc* chunks, int32_t num_chunks, @@ -480,7 +480,7 @@ void BuildStringDictionaryIndex(ColumnChunkDesc* chunks, * @param compute_string_sizes If set to true, the str_bytes field in PageInfo will * be computed * @param level_type_size Size in bytes of the type for level decoding - * @param stream CUDA stream to use, default 0 + * @param stream CUDA stream to use */ void ComputePageSizes(cudf::detail::hostdevice_vector& pages, cudf::detail::hostdevice_vector const& chunks, @@ -504,7 +504,7 @@ void ComputePageSizes(cudf::detail::hostdevice_vector& pages, * @param[in] min_rows crop all rows below min_row * @param[in] num_rows Maximum number of rows to read * @param[in] level_type_size Size in bytes of the type for level decoding - * @param[in] stream CUDA stream to use, default 0 + * @param[in] stream CUDA stream to use */ void ComputePageStringSizes(cudf::detail::hostdevice_vector& pages, cudf::detail::hostdevice_vector const& chunks, @@ -524,7 +524,7 @@ void ComputePageStringSizes(cudf::detail::hostdevice_vector& pages, * @param[in] num_rows Total number of rows to read * @param[in] min_row Minimum number of rows to read * @param[in] level_type_size Size in bytes of the type for level decoding - * @param[in] stream CUDA stream to use, default 0 + * @param[in] stream CUDA stream to use */ void DecodePageData(cudf::detail::hostdevice_vector& pages, cudf::detail::hostdevice_vector const& chunks, @@ -544,7 +544,7 @@ void DecodePageData(cudf::detail::hostdevice_vector& pages, * @param[in] num_rows Total number of rows to read * @param[in] min_row Minimum number of rows to read * @param[in] level_type_size Size in bytes of the 
type for level decoding - * @param[in] stream CUDA stream to use, default 0 + * @param[in] stream CUDA stream to use */ void DecodeStringPageData(cudf::detail::hostdevice_vector& pages, cudf::detail::hostdevice_vector const& chunks, @@ -654,7 +654,7 @@ void get_dictionary_indices(cudf::detail::device_2dspan * @param[in] write_v2_headers True if V2 page headers should be written * @param[in] chunk_grstats Setup for chunk-level stats * @param[in] max_page_comp_data_size Calculated maximum compressed data size of pages - * @param[in] stream CUDA stream to use, default 0 + * @param[in] stream CUDA stream to use */ void InitEncoderPages(cudf::detail::device_2dspan chunks, device_span pages, diff --git a/cpp/src/io/parquet/predicate_pushdown.cpp b/cpp/src/io/parquet/predicate_pushdown.cpp index 53ebd4900eb..805d082c71e 100644 --- a/cpp/src/io/parquet/predicate_pushdown.cpp +++ b/cpp/src/io/parquet/predicate_pushdown.cpp @@ -375,10 +375,10 @@ class stats_expression_converter : public ast::detail::expression_transformer { std::optional>> aggregate_reader_metadata::filter_row_groups( host_span const> row_group_indices, host_span output_dtypes, - std::reference_wrapper filter) const + std::reference_wrapper filter, + rmm::cuda_stream_view stream) const { - auto stream = cudf::get_default_stream(); - auto mr = rmm::mr::get_current_device_resource(); + auto mr = rmm::mr::get_current_device_resource(); // Create row group indices. std::vector> filtered_row_group_indices; std::vector> all_row_group_indices; diff --git a/cpp/src/io/parquet/reader_impl.cpp b/cpp/src/io/parquet/reader_impl.cpp index 5a44eb6baa0..b9f3639da79 100644 --- a/cpp/src/io/parquet/reader_impl.cpp +++ b/cpp/src/io/parquet/reader_impl.cpp @@ -326,7 +326,8 @@ void reader::impl::prepare_data(int64_t skip_rows, [](auto const& col) { return col.type; }); } auto const [skip_rows_corrected, num_rows_corrected, row_groups_info] = - _metadata->select_row_groups(row_group_indices, skip_rows, num_rows, output_types, filter); + _metadata->select_row_groups( + row_group_indices, skip_rows, num_rows, output_types, filter, _stream); if (num_rows_corrected > 0 && not row_groups_info.empty() && not _input_columns.empty()) { load_and_decompress_data(row_groups_info, num_rows_corrected); diff --git a/cpp/src/io/parquet/reader_impl_helpers.cpp b/cpp/src/io/parquet/reader_impl_helpers.cpp index 9444ffbcf02..f6dbeb275fc 100644 --- a/cpp/src/io/parquet/reader_impl_helpers.cpp +++ b/cpp/src/io/parquet/reader_impl_helpers.cpp @@ -350,12 +350,13 @@ aggregate_reader_metadata::select_row_groups( int64_t skip_rows_opt, std::optional const& num_rows_opt, host_span output_dtypes, - std::optional> filter) const + std::optional> filter, + rmm::cuda_stream_view stream) const { std::optional>> filtered_row_group_indices; if (filter.has_value()) { filtered_row_group_indices = - filter_row_groups(row_group_indices, output_dtypes, filter.value()); + filter_row_groups(row_group_indices, output_dtypes, filter.value(), stream); if (filtered_row_group_indices.has_value()) { row_group_indices = host_span const>(filtered_row_group_indices.value()); diff --git a/cpp/src/io/parquet/reader_impl_helpers.hpp b/cpp/src/io/parquet/reader_impl_helpers.hpp index 1dbcacf0a94..751ffc33123 100644 --- a/cpp/src/io/parquet/reader_impl_helpers.hpp +++ b/cpp/src/io/parquet/reader_impl_helpers.hpp @@ -170,12 +170,14 @@ class aggregate_reader_metadata { * @param row_group_indices Lists of row groups to read, one per source * @param output_dtypes List of output column datatypes * @param filter 
AST expression to filter row groups based on Column chunk statistics + * @param stream CUDA stream used for device memory operations and kernel launches * @return Filtered row group indices, if any is filtered. */ [[nodiscard]] std::optional>> filter_row_groups( host_span const> row_group_indices, host_span output_dtypes, - std::reference_wrapper filter) const; + std::reference_wrapper filter, + rmm::cuda_stream_view stream) const; /** * @brief Filters and reduces down to a selection of row groups @@ -188,7 +190,7 @@ class aggregate_reader_metadata { * @param row_count Total number of rows selected * @param output_dtypes List of output column datatypes * @param filter Optional AST expression to filter row groups based on Column chunk statistics - * + * @param stream CUDA stream used for device memory operations and kernel launches * @return A tuple of corrected row_start, row_count and list of row group indexes and its * starting row */ @@ -197,7 +199,8 @@ class aggregate_reader_metadata { int64_t row_start, std::optional const& row_count, host_span output_dtypes, - std::optional> filter) const; + std::optional> filter, + rmm::cuda_stream_view stream) const; /** * @brief Filters and reduces down to a selection of columns diff --git a/cpp/src/io/parquet/reader_impl_preprocess.cu b/cpp/src/io/parquet/reader_impl_preprocess.cu index 1ea89f5f694..7cdccf0b273 100644 --- a/cpp/src/io/parquet/reader_impl_preprocess.cu +++ b/cpp/src/io/parquet/reader_impl_preprocess.cu @@ -1178,7 +1178,7 @@ std::vector find_splits(std::vector c * @param id Additional intermediate information required to process the pages * @param num_rows Total number of rows to read * @param chunk_read_limit Limit on total number of bytes to be returned per read, for all columns - * @param stream CUDA stream to use, default 0 + * @param stream CUDA stream to use */ std::vector compute_splits( cudf::detail::hostdevice_vector& pages, From 263a85d70edbf08232beb3286c1a2d0f08afe76e Mon Sep 17 00:00:00 2001 From: GALI PREM SAGAR Date: Fri, 18 Aug 2023 17:49:38 -0500 Subject: [PATCH 075/230] Preserve index `name` in `reindex` (#13917) Fixes: #13900 This PR fixes an issue with `reindex` API, where `name` of the index being reindexed upon was lost. This PR fixes it to match pandas by using the new index name if it exists or preserving the old name. 
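For illustration only (not part of this patch), a minimal Python sketch of the fixed behavior, mirroring the `test_dataframe_reindex_with_index_names` test added below; the data values are made up:

```python
import cudf

gdf = cudf.DataFrame({"a": [10, 12, 13], "b": [20, 30, 40]})

# Reindexing against a named Index: the result now carries that name.
gidx = cudf.Index([10, 13], name="a")
assert gdf.reindex(gidx).index.name == "a"

# Reindexing against a plain list: the existing index name is preserved.
assert gdf.set_index("a").reindex([10, 13]).index.name == "a"
```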
Authors: - GALI PREM SAGAR (https://github.com/galipremsagar) Approvers: - Bradley Dice (https://github.com/bdice) URL: https://github.com/rapidsai/cudf/pull/13917 --- python/cudf/cudf/core/indexed_frame.py | 27 +++++++++++++++++++++-- python/cudf/cudf/tests/test_dataframe.py | 28 ++++++++++++++++++++++++ 2 files changed, 53 insertions(+), 2 deletions(-) diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py index 51a2d085d00..8e6cdbb2787 100644 --- a/python/cudf/cudf/core/indexed_frame.py +++ b/python/cudf/cudf/core/indexed_frame.py @@ -2582,10 +2582,12 @@ def _reindex( df = self if index is not None: - index = cudf.core.index.as_index(index) + index = cudf.core.index.as_index( + index, name=getattr(index, "name", self._index.name) + ) idx_dtype_match = (df.index.nlevels == index.nlevels) and all( - left_dtype == right_dtype + _is_same_dtype(left_dtype, right_dtype) for left_dtype, right_dtype in zip( (col.dtype for col in df.index._data.columns), (col.dtype for col in index._data.columns), @@ -5405,3 +5407,24 @@ def _drop_rows_by_labels( res = obj.to_frame(name="tmp").join(key_df, how="leftanti")["tmp"] res.name = obj.name return res + + +def _is_same_dtype(lhs_dtype, rhs_dtype): + # Utility specific to `_reindex` to check + # for matching column dtype. + if lhs_dtype == rhs_dtype: + return True + elif ( + is_categorical_dtype(lhs_dtype) + and not is_categorical_dtype(rhs_dtype) + and lhs_dtype.categories.dtype == rhs_dtype + ): + return True + elif ( + is_categorical_dtype(rhs_dtype) + and not is_categorical_dtype(lhs_dtype) + and rhs_dtype.categories.dtype == lhs_dtype + ): + return True + else: + return False diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py index 2e1e20dee40..0501874ecda 100644 --- a/python/cudf/cudf/tests/test_dataframe.py +++ b/python/cudf/cudf/tests/test_dataframe.py @@ -10288,3 +10288,31 @@ def test_dataframe_mixed_dtype_error(dtype): pdf = pd.Series([1, 2, 3], dtype=dtype).to_frame().astype(object) with pytest.raises(TypeError): cudf.from_pandas(pdf) + + +@pytest.mark.parametrize( + "index_data,name", + [([10, 13], "a"), ([30, 40, 20], "b"), (["ef"], "c"), ([2, 3], "Z")], +) +def test_dataframe_reindex_with_index_names(index_data, name): + gdf = cudf.DataFrame( + { + "a": [10, 12, 13], + "b": [20, 30, 40], + "c": cudf.Series(["ab", "cd", "ef"], dtype="category"), + } + ) + if name in gdf.columns: + gdf = gdf.set_index(name) + pdf = gdf.to_pandas() + + gidx = cudf.Index(index_data, name=name) + actual = gdf.reindex(gidx) + expected = pdf.reindex(gidx.to_pandas()) + + assert_eq(actual, expected) + + actual = gdf.reindex(index_data) + expected = pdf.reindex(index_data) + + assert_eq(actual, expected) From 5eee8ac988686dd8e0cc8328194055aa7579d9b6 Mon Sep 17 00:00:00 2001 From: brandon-b-miller <53796099+brandon-b-miller@users.noreply.github.com> Date: Mon, 21 Aug 2023 08:05:53 -0500 Subject: [PATCH 076/230] Return `nan` when one variable to be correlated has zero variance in JIT GroupBy Apply (#13884) Closes https://github.com/rapidsai/cudf/issues/13875 Authors: - https://github.com/brandon-b-miller Approvers: - Vyas Ramasubramani (https://github.com/vyasr) - GALI PREM SAGAR (https://github.com/galipremsagar) URL: https://github.com/rapidsai/cudf/pull/13884 --- python/cudf/cudf/tests/test_groupby.py | 15 +++++++++++++++ python/cudf/udf_cpp/shim.cu | 3 +-- 2 files changed, 16 insertions(+), 2 deletions(-) diff --git a/python/cudf/cudf/tests/test_groupby.py 
b/python/cudf/cudf/tests/test_groupby.py index b01b44da201..e578e1061ca 100644 --- a/python/cudf/cudf/tests/test_groupby.py +++ b/python/cudf/cudf/tests/test_groupby.py @@ -449,6 +449,21 @@ def func(group): run_groupby_apply_jit_test(groupby_jit_data, func, keys) +@pytest.mark.parametrize("dtype", ["int32", "int64"]) +def test_groupby_apply_jit_correlation_zero_variance(dtype): + # pearson correlation is undefined when the variance of either + # variable is zero. This test ensures that the jit implementation + # returns the same result as pandas in this case. + data = DataFrame( + {"a": [0, 0, 0, 0, 0], "b": [1, 1, 1, 1, 1], "c": [2, 2, 2, 2, 2]} + ) + + def func(group): + return group["b"].corr(group["c"]) + + run_groupby_apply_jit_test(data, func, ["a"]) + + @pytest.mark.parametrize("dtype", ["float64"]) @pytest.mark.parametrize("func", ["min", "max", "sum", "mean", "var", "std"]) @pytest.mark.parametrize("special_val", [np.nan, np.inf, -np.inf]) diff --git a/python/cudf/udf_cpp/shim.cu b/python/cudf/udf_cpp/shim.cu index 0959b6ba53f..686e39e7036 100644 --- a/python/cudf/udf_cpp/shim.cu +++ b/python/cudf/udf_cpp/shim.cu @@ -643,9 +643,8 @@ __device__ double BlockCorr(T* const lhs_ptr, T* const rhs_ptr, int64_t size) { auto numerator = BlockCoVar(lhs_ptr, rhs_ptr, size); auto denominator = BlockStd(lhs_ptr, size) * BlockStd(rhs_ptr, size); - if (denominator == 0.0) { - return 0.0; + return std::numeric_limits::quiet_NaN(); } else { return numerator / denominator; } From 55a4ecf14d43dae92254805358e09d6a60010fc9 Mon Sep 17 00:00:00 2001 From: MithunR Date: Mon, 21 Aug 2023 10:00:50 -0700 Subject: [PATCH 077/230] Translate column size overflow exception to JNI (#13911) When a CUDF operation causes a column's row count to exceed the size limit imposed by `cudf::size_type`, the operation throws a `std::overflow_error` exception. However, prior to this commit, CUDF JNI did not translate this to a separate Java exception. Because of handling this condition as any generic exception, there was no way to attempt case specific recovery for overflow conditions. This commit translates `std::overflow_error` into a new Java exception (`CudfColumnOverflowException`) that may be caught in user space to attempt recovery/retry. This is a non-breaking change. The user-facing change is minimal in that existing failure handling based on catching `CudfException` will continue to work as before. The user will now have more fine grained error handling by catching `CudfColumnOverflowException`. Authors: - MithunR (https://github.com/mythrocks) Approvers: - Gera Shegalov (https://github.com/gerashegalov) - Nghia Truong (https://github.com/ttnghia) URL: https://github.com/rapidsai/cudf/pull/13911 --- .../cudf/CudfColumnSizeOverflowException.java | 34 ++++++++++ java/src/main/native/include/jni_utils.hpp | 5 ++ .../java/ai/rapids/cudf/LargeTableTest.java | 66 +++++++++++++++++++ 3 files changed, 105 insertions(+) create mode 100755 java/src/main/java/ai/rapids/cudf/CudfColumnSizeOverflowException.java create mode 100644 java/src/test/java/ai/rapids/cudf/LargeTableTest.java diff --git a/java/src/main/java/ai/rapids/cudf/CudfColumnSizeOverflowException.java b/java/src/main/java/ai/rapids/cudf/CudfColumnSizeOverflowException.java new file mode 100755 index 00000000000..9e724907a3c --- /dev/null +++ b/java/src/main/java/ai/rapids/cudf/CudfColumnSizeOverflowException.java @@ -0,0 +1,34 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. 
+ * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package ai.rapids.cudf; + +/** + * Exception thrown when CUDF operation results in a column size + * exceeding CUDF column size limits + */ +public class CudfColumnSizeOverflowException extends CudfException { + CudfColumnSizeOverflowException(String message) { + super(message); + } + + CudfColumnSizeOverflowException(String message, String nativeStacktrace) { + super(message, nativeStacktrace); + } + + CudfColumnSizeOverflowException(String message, String nativeStacktrace, Throwable cause) { + super(message, nativeStacktrace, cause); + } +} diff --git a/java/src/main/native/include/jni_utils.hpp b/java/src/main/native/include/jni_utils.hpp index 07d4c1a9c34..ff4da893329 100644 --- a/java/src/main/native/include/jni_utils.hpp +++ b/java/src/main/native/include/jni_utils.hpp @@ -32,6 +32,7 @@ constexpr jint MINIMUM_JNI_VERSION = JNI_VERSION_1_6; constexpr char const *CUDA_ERROR_CLASS = "ai/rapids/cudf/CudaException"; constexpr char const *CUDA_FATAL_ERROR_CLASS = "ai/rapids/cudf/CudaFatalException"; constexpr char const *CUDF_ERROR_CLASS = "ai/rapids/cudf/CudfException"; +constexpr char const *CUDF_OVERFLOW_ERROR_CLASS = "ai/rapids/cudf/CudfColumnSizeOverflowException"; constexpr char const *CUDF_DTYPE_ERROR_CLASS = "ai/rapids/cudf/CudfException"; constexpr char const *INDEX_OOB_CLASS = "java/lang/ArrayIndexOutOfBoundsException"; constexpr char const *ILLEGAL_ARG_CLASS = "java/lang/IllegalArgumentException"; @@ -901,6 +902,10 @@ inline void jni_cuda_check(JNIEnv *const env, cudaError_t cuda_status) { JNI_CHECK_THROW_CUDF_EXCEPTION(env, cudf::jni::CUDF_DTYPE_ERROR_CLASS, e.what(), \ e.stacktrace(), ret_val); \ } \ + catch (std::overflow_error const &e) { \ + JNI_CHECK_THROW_CUDF_EXCEPTION(env, cudf::jni::CUDF_OVERFLOW_ERROR_CLASS, e.what(), \ + "No native stacktrace is available.", ret_val); \ + } \ catch (const std::exception &e) { \ char const *stacktrace = "No native stacktrace is available."; \ if (auto const cudf_ex = dynamic_cast(&e); cudf_ex != nullptr) { \ diff --git a/java/src/test/java/ai/rapids/cudf/LargeTableTest.java b/java/src/test/java/ai/rapids/cudf/LargeTableTest.java new file mode 100644 index 00000000000..d5e0942dfdd --- /dev/null +++ b/java/src/test/java/ai/rapids/cudf/LargeTableTest.java @@ -0,0 +1,66 @@ +/* + * Copyright (c) 2019-2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package ai.rapids.cudf; + +import org.junit.jupiter.api.Test; + +import static org.junit.jupiter.api.Assertions.*; + +/** + * Test for operations on tables with large row counts. + */ +public class LargeTableTest extends CudfTestBase { + + static final long RMM_POOL_SIZE_LARGE = 10L * 1024 * 1024 * 1024; + + public LargeTableTest() { + // Set large RMM pool size. Ensure that the test does not run out of memory, + // for large row counts. + super(RmmAllocationMode.POOL, RMM_POOL_SIZE_LARGE); + } + + /** + * Tests that exploding large array columns will result in CudfColumnOverflowException + * if the column size limit is crossed. + */ + @Test + public void testExplodeOverflow() { + int numRows = 1000_000; + int arraySize = 1000; + String str = "abc"; + + // 1 Million rows, each row being { "abc", [ 0, 0, 0... ] }, + // with 1000 elements in the array in each row. + // When the second column is exploded, it produces 1 Billion rows. + // The string row is repeated once for each element in the array, + // thus producing a 1 Billion row string column, with 3 Billion chars + // in the child column. This should cause an overflow exception. + boolean [] arrBools = new boolean[arraySize]; + for (char i = 0; i < arraySize; ++i) { arrBools[i] = false; } + Exception exception = assertThrows(CudfColumnSizeOverflowException.class, ()->{ + try (Scalar strScalar = Scalar.fromString(str); + ColumnVector arrRow = ColumnVector.fromBooleans(arrBools); + Scalar arrScalar = Scalar.listFromColumnView(arrRow); + ColumnVector strVector = ColumnVector.fromScalar(strScalar, numRows); + ColumnVector arrVector = ColumnVector.fromScalar(arrScalar, numRows); + Table inputTable = new Table(strVector, arrVector); + Table outputTable = inputTable.explode(1)) { + assertEquals(outputTable.getColumns()[0].getRowCount(), numRows * arraySize); + fail("Exploding this large table should have caused a CudfColumnSizeOverflowException."); + }}); + assertTrue(exception.getMessage().contains("Size of output exceeds the column size limit")); + } +} From c2f216796d18d7b4b0beb4d8f51e219ed10195d1 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Mon, 21 Aug 2023 15:36:53 -0500 Subject: [PATCH 078/230] Ensure cudf internals use pylibcudf in pure Python mode (#13909) In the short term, cudf internals will be the primary client of pylibcudf, and in the intermediate term cudf internals will likely remain the most exhaustive test of pylibcudf's API. While pylibcudf is designed to be used as either a Python or a Cython library, Python usage is expected to be the much more common entry point. Therefore, it would be best for cudf to use pylibcudf solely as a Python library. This is also useful because it would theoretically allow us to turn cudf into a pure Python library, or at least something much closer to it. In the long run we will likely want to leverage pylibcudf's Cython to accelerate parts of cudf, but that isn't something we should aim for the in the first pass. Switching to pure Python mode is also important because of certain limitations with Cython scoped enum support that will hopefully be fixed soon in Cython itself. 
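As a rough sketch of the pattern this change standardizes (not part of the patch): cudf internals import pylibcudf as a plain Python module and round-trip data through `Column.to_pylibcudf`/`Column.from_pylibcudf`, as in the `gather` hunk in `copying.pyx` below. The helper name `gather_columns` is made up for illustration:

```python
from cudf._lib import pylibcudf
from cudf._lib.column import Column


def gather_columns(source_columns, gather_map):
    # `source_columns` and `gather_map` are assumed to be existing
    # cudf._lib.column.Column objects; to_pylibcudf() hands pylibcudf views
    # of the same device buffers rather than copying them.
    plc_result = pylibcudf.copying.gather(
        pylibcudf.Table([c.to_pylibcudf(mode="read") for c in source_columns]),
        gather_map.to_pylibcudf(mode="read"),
        pylibcudf.copying.OutOfBoundsPolicy.NULLIFY,
    )
    # from_pylibcudf() wraps the results back into cudf Columns.
    return [Column.from_pylibcudf(col) for col in plc_result.columns()]
```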
Authors: - Vyas Ramasubramani (https://github.com/vyasr) - Ashwin Srinath (https://github.com/shwina) Approvers: - Ashwin Srinath (https://github.com/shwina) URL: https://github.com/rapidsai/cudf/pull/13909 --- python/cudf/cudf/_lib/column.pxd | 3 +- python/cudf/cudf/_lib/column.pyx | 31 ++++--- python/cudf/cudf/_lib/copying.pyx | 4 +- python/cudf/cudf/_lib/pylibcudf/column.pxd | 34 +++++-- python/cudf/cudf/_lib/pylibcudf/column.pyx | 90 +++++++++++++++---- .../cudf/_lib/pylibcudf/gpumemoryview.pxd | 7 +- python/cudf/cudf/_lib/pylibcudf/table.pxd | 4 +- python/cudf/cudf/_lib/pylibcudf/table.pyx | 9 +- python/cudf/cudf/_lib/types.pxd | 3 +- python/cudf/cudf/_lib/types.pyx | 59 +++++++++++- python/cudf/cudf/_lib/utils.pxd | 3 +- python/cudf/cudf/_lib/utils.pyx | 6 +- 12 files changed, 194 insertions(+), 59 deletions(-) diff --git a/python/cudf/cudf/_lib/column.pxd b/python/cudf/cudf/_lib/column.pxd index fbdf6288538..7ffb55a6cc6 100644 --- a/python/cudf/cudf/_lib/column.pxd +++ b/python/cudf/cudf/_lib/column.pxd @@ -7,7 +7,6 @@ from libcpp.memory cimport unique_ptr from rmm._lib.device_buffer cimport device_buffer -from cudf._lib cimport pylibcudf from cudf._lib.cpp.column.column cimport column from cudf._lib.cpp.column.column_view cimport column_view, mutable_column_view from cudf._lib.cpp.types cimport size_type @@ -30,7 +29,7 @@ cdef class Column: cdef column_view _view(self, size_type null_count) except * cdef column_view view(self) except * cdef mutable_column_view mutable_view(self) except * - cpdef pylibcudf.Column to_pylibcudf(self, mode: Literal["read", "write"]) + cpdef to_pylibcudf(self, mode: Literal["read", "write"]) @staticmethod cdef Column from_unique_ptr( diff --git a/python/cudf/cudf/_lib/column.pyx b/python/cudf/cudf/_lib/column.pyx index 2b1fc14f398..4db3761b1b8 100644 --- a/python/cudf/cudf/_lib/column.pyx +++ b/python/cudf/cudf/_lib/column.pyx @@ -11,9 +11,6 @@ import rmm import cudf import cudf._lib as libcudf from cudf._lib import pylibcudf - -from cudf._lib cimport pylibcudf - from cudf.api.types import is_categorical_dtype, is_datetime64tz_dtype from cudf.core.buffer import ( Buffer, @@ -40,6 +37,7 @@ from cudf._lib.types cimport ( ) from cudf._lib.null_mask import bitmask_allocation_size_bytes +from cudf._lib.types import dtype_from_pylibcudf_column cimport cudf._lib.cpp.types as libcudf_types cimport cudf._lib.cpp.unary as libcudf_unary @@ -446,7 +444,7 @@ cdef class Column: # underlying buffers as exposed before this function can itself be exposed # publicly. User requests to convert to pylibcudf must assume that the # data may be modified afterwards. - cpdef pylibcudf.Column to_pylibcudf(self, mode: Literal["read", "write"]): + cpdef to_pylibcudf(self, mode: Literal["read", "write"]): """Convert this Column to a pylibcudf.Column. This function will generate a pylibcudf Column pointing to the same @@ -476,9 +474,9 @@ cdef class Column: else: col = self - cdef pylibcudf.DataType dtype = dtype_to_pylibcudf_type(col.dtype) + dtype = dtype_to_pylibcudf_type(col.dtype) - cdef pylibcudf.gpumemoryview data = None + data = None if col.base_data is not None: cai = cuda_array_interface_wrapper( ptr=col.base_data.get_ptr(mode=mode), @@ -487,7 +485,7 @@ cdef class Column: ) data = pylibcudf.gpumemoryview(cai) - cdef pylibcudf.gpumemoryview mask = None + mask = None if self.nullable: # TODO: Are we intentionally use self's mask instead of col's? # Where is the mask stored for categoricals? @@ -586,7 +584,7 @@ cdef class Column: # TODO: Actually support exposed data pointers. 
@staticmethod def from_pylibcudf( - pylibcudf.Column col, bint data_ptr_exposed=False + col, bint data_ptr_exposed=False ): """Create a Column from a pylibcudf.Column. @@ -607,19 +605,20 @@ cdef class Column: pylibcudf.Column A new pylibcudf.Column referencing the same data. """ - # TODO: Rewrite utility for dtype conversion to not need a column view. - dtype = dtype_from_column_view(col.view()) + dtype = dtype_from_pylibcudf_column(col) return cudf.core.column.build_column( - data=as_buffer(col.data.obj) if col.data is not None else None, + data=as_buffer(col.data().obj) if col.data() is not None else None, dtype=dtype, - size=col.size, - mask=as_buffer(col.mask.obj) if col.mask is not None else None, - offset=col.offset, - null_count=col.null_count, + size=col.size(), + mask=as_buffer( + col.null_mask().obj + ) if col.null_mask() is not None else None, + offset=col.offset(), + null_count=col.null_count(), children=tuple([ Column.from_pylibcudf(child) - for child in col.children + for child in col.children() ]) ) diff --git a/python/cudf/cudf/_lib/copying.pyx b/python/cudf/cudf/_lib/copying.pyx index 944a80158df..f57bc15ed57 100644 --- a/python/cudf/cudf/_lib/copying.pyx +++ b/python/cudf/cudf/_lib/copying.pyx @@ -11,9 +11,9 @@ from libcpp.vector cimport vector from rmm._lib.device_buffer cimport DeviceBuffer import cudf +from cudf._lib import pylibcudf from cudf.core.buffer import Buffer, acquire_spill_lock, as_buffer -from cudf._lib cimport pylibcudf from cudf._lib.column cimport Column from cudf._lib.scalar import as_device_scalar @@ -174,7 +174,7 @@ def gather( Column gather_map, bool nullify=False ): - cdef pylibcudf.Table tbl = pylibcudf.copying.gather( + tbl = pylibcudf.copying.gather( pylibcudf.Table([col.to_pylibcudf(mode="read") for col in columns]), gather_map.to_pylibcudf(mode="read"), pylibcudf.copying.OutOfBoundsPolicy.NULLIFY if nullify diff --git a/python/cudf/cudf/_lib/pylibcudf/column.pxd b/python/cudf/cudf/_lib/pylibcudf/column.pxd index 2b08e6863a1..2af87db5b03 100644 --- a/python/cudf/cudf/_lib/pylibcudf/column.pxd +++ b/python/cudf/cudf/_lib/pylibcudf/column.pxd @@ -15,16 +15,36 @@ cdef class Column: # TODO: Should we document these attributes? Should we mark them readonly? 
cdef: # Core data - DataType data_type - size_type size - gpumemoryview data - gpumemoryview mask - size_type null_count - size_type offset + DataType _data_type + size_type _size + gpumemoryview _data + gpumemoryview _mask + size_type _null_count + size_type _offset # children: List[Column] - list children + list _children + size_type _num_children cdef column_view view(self) nogil @staticmethod cdef Column from_libcudf(unique_ptr[column] libcudf_col) + + cpdef DataType type(self) + cpdef Column child(self, size_type index) + cpdef size_type num_children(self) + cpdef size_type size(self) + cpdef size_type null_count(self) + cpdef size_type offset(self) + cpdef gpumemoryview data(self) + cpdef gpumemoryview null_mask(self) + cpdef list children(self) + + cpdef list_view(self) + + +cdef class ListColumnView: + """Accessor for methods of a Column that are specific to lists.""" + cdef Column _column + cpdef child(self) + cpdef offsets(self) diff --git a/python/cudf/cudf/_lib/pylibcudf/column.pyx b/python/cudf/cudf/_lib/pylibcudf/column.pyx index be4eff4c49d..d9b2ca98ead 100644 --- a/python/cudf/cudf/_lib/pylibcudf/column.pyx +++ b/python/cudf/cudf/_lib/pylibcudf/column.pyx @@ -9,7 +9,7 @@ from cudf._lib.cpp.column.column cimport column, column_contents from cudf._lib.cpp.types cimport size_type from .gpumemoryview cimport gpumemoryview -from .types cimport DataType +from .types cimport DataType, TypeId from .utils cimport int_to_bitmask_ptr, int_to_void_ptr @@ -45,13 +45,14 @@ cdef class Column: gpumemoryview mask, size_type null_count, size_type offset, list children ): - self.data_type = data_type - self.size = size - self.data = data - self.mask = mask - self.null_count = null_count - self.offset = offset - self.children = children + self._data_type = data_type + self._size = size + self._data = data + self._mask = mask + self._null_count = null_count + self._offset = offset + self._children = children + self._num_children = len(children) cdef column_view view(self) nogil: """Generate a libcudf column_view to pass to libcudf algorithms. @@ -63,17 +64,17 @@ cdef class Column: cdef const void * data = NULL cdef const bitmask_type * null_mask = NULL - if self.data is not None: - data = int_to_void_ptr(self.data.ptr) - if self.mask is not None: - null_mask = int_to_bitmask_ptr(self.mask.ptr) + if self._data is not None: + data = int_to_void_ptr(self._data.ptr) + if self._mask is not None: + null_mask = int_to_bitmask_ptr(self._mask.ptr) # TODO: Check if children can ever change. If not, this could be # computed once in the constructor and always be reused. cdef vector[column_view] c_children with gil: - if self.children is not None: - for child in self.children: + if self._children is not None: + for child in self._children: # Need to cast to Column here so that Cython knows that # `view` returns a typed object, not a Python object. We # cannot use a typed variable for `child` because cdef @@ -86,8 +87,8 @@ cdef class Column: c_children.push_back(( child).view()) return column_view( - self.data_type.c_obj, self.size, data, null_mask, - self.null_count, self.offset, c_children + self._data_type.c_obj, self._size, data, null_mask, + self._null_count, self._offset, c_children ) @staticmethod @@ -133,3 +134,60 @@ cdef class Column: 0, children, ) + + cpdef DataType type(self): + """The type of data in the column.""" + return self._data_type + + cpdef Column child(self, size_type index): + """Get a child column of this column. 
+ + Parameters + ---------- + index : size_type + The index of the child column to get. + + Returns + ------- + Column + The child column. + """ + return self._children[index] + + cpdef size_type num_children(self): + """The number of children of this column.""" + return self._num_children + + cpdef list_view(self): + return ListColumnView(self) + + cpdef gpumemoryview data(self): + return self._data + + cpdef gpumemoryview null_mask(self): + return self._mask + + cpdef size_type size(self): + return self._size + + cpdef size_type offset(self): + return self._offset + + cpdef size_type null_count(self): + return self._null_count + + cpdef list children(self): + return self._children + +cdef class ListColumnView: + """Accessor for methods of a Column that are specific to lists.""" + def __init__(self, Column col): + if col.type().id() != TypeId.LIST: + raise TypeError("Column is not a list type") + self._column = col + + cpdef child(self): + return self._column.child(1) + + cpdef offsets(self): + return self._column.child(1) diff --git a/python/cudf/cudf/_lib/pylibcudf/gpumemoryview.pxd b/python/cudf/cudf/_lib/pylibcudf/gpumemoryview.pxd index 5e656744a8c..713697bd139 100644 --- a/python/cudf/cudf/_lib/pylibcudf/gpumemoryview.pxd +++ b/python/cudf/cudf/_lib/pylibcudf/gpumemoryview.pxd @@ -2,5 +2,8 @@ cdef class gpumemoryview: - cdef Py_ssize_t ptr - cdef object obj + # TODO: Eventually probably want to make this opaque, but for now it's fine + # to treat this object as something like a POD struct + cdef readonly: + Py_ssize_t ptr + object obj diff --git a/python/cudf/cudf/_lib/pylibcudf/table.pxd b/python/cudf/cudf/_lib/pylibcudf/table.pxd index 4f189f2c398..95f197b13eb 100644 --- a/python/cudf/cudf/_lib/pylibcudf/table.pxd +++ b/python/cudf/cudf/_lib/pylibcudf/table.pxd @@ -8,9 +8,11 @@ from cudf._lib.cpp.table.table_view cimport table_view cdef class Table: # List[pylibcudf.Column] - cdef object columns + cdef list _columns cdef table_view view(self) nogil @staticmethod cdef Table from_libcudf(unique_ptr[table] libcudf_tbl) + + cpdef list columns(self) diff --git a/python/cudf/cudf/_lib/pylibcudf/table.pyx b/python/cudf/cudf/_lib/pylibcudf/table.pyx index db422dd420b..720f9815bd6 100644 --- a/python/cudf/cudf/_lib/pylibcudf/table.pyx +++ b/python/cudf/cudf/_lib/pylibcudf/table.pyx @@ -21,7 +21,7 @@ cdef class Table: The columns in this table. """ def __init__(self, list columns): - self.columns = columns + self._columns = columns cdef table_view view(self) nogil: """Generate a libcudf table_view to pass to libcudf algorithms. @@ -31,11 +31,11 @@ cdef class Table: (even direct pylibcudf Cython users). """ # TODO: Make c_columns a class attribute that is updated along with - # self.columns whenever new columns are added or columns are removed. + # self._columns whenever new columns are added or columns are removed. 
cdef vector[column_view] c_columns with gil: - for col in self.columns: + for col in self._columns: c_columns.push_back(( col).view()) return table_view(c_columns) @@ -57,3 +57,6 @@ cdef class Table: Column.from_libcudf(move(c_columns[i])) for i in range(c_columns.size()) ]) + + cpdef list columns(self): + return self._columns diff --git a/python/cudf/cudf/_lib/types.pxd b/python/cudf/cudf/_lib/types.pxd index 1eeaa23c260..a95db84ceff 100644 --- a/python/cudf/cudf/_lib/types.pxd +++ b/python/cudf/cudf/_lib/types.pxd @@ -4,7 +4,6 @@ from libc.stdint cimport int32_t from libcpp cimport bool cimport cudf._lib.cpp.types as libcudf_types -from cudf._lib cimport pylibcudf from cudf._lib.cpp.column.column_view cimport column_view from cudf._lib.cpp.lists.lists_column_view cimport lists_column_view @@ -18,5 +17,5 @@ ctypedef bool underlying_type_t_null_policy cdef dtype_from_column_view(column_view cv) cdef libcudf_types.data_type dtype_to_data_type(dtype) except * -cpdef pylibcudf.DataType dtype_to_pylibcudf_type(dtype) +cpdef dtype_to_pylibcudf_type(dtype) cdef bool is_decimal_type_id(libcudf_types.type_id tid) except * diff --git a/python/cudf/cudf/_lib/types.pyx b/python/cudf/cudf/_lib/types.pyx index c3eca8090a3..8594e37ac4a 100644 --- a/python/cudf/cudf/_lib/types.pyx +++ b/python/cudf/cudf/_lib/types.pyx @@ -17,7 +17,6 @@ from cudf._lib.types cimport ( import cudf from cudf._lib import pylibcudf -from cudf._lib cimport pylibcudf size_type_dtype = np.dtype("int32") @@ -129,6 +128,11 @@ LIBCUDF_TO_SUPPORTED_NUMPY_TYPES = { TypeId.STRUCT: np.dtype("object"), } +PYLIBCUDF_TO_SUPPORTED_NUMPY_TYPES = { + pylibcudf.TypeId(k).value: v + for k, v in LIBCUDF_TO_SUPPORTED_NUMPY_TYPES.items() +} + duration_unit_map = { TypeId.DURATION_SECONDS: "s", TypeId.DURATION_MILLISECONDS: "ms", @@ -252,7 +256,7 @@ cdef libcudf_types.data_type dtype_to_data_type(dtype) except *: else: return libcudf_types.data_type(tid) -cpdef pylibcudf.DataType dtype_to_pylibcudf_type(dtype): +cpdef dtype_to_pylibcudf_type(dtype): if cudf.api.types.is_list_dtype(dtype): return pylibcudf.DataType(pylibcudf.TypeId.LIST) elif cudf.api.types.is_struct_dtype(dtype): @@ -275,3 +279,54 @@ cdef bool is_decimal_type_id(libcudf_types.type_id tid) except *: libcudf_types.type_id.DECIMAL64, libcudf_types.type_id.DECIMAL32, ) + + +def dtype_from_pylibcudf_lists_column(col): + child = col.list_view().child() + tid = child.type().id() + + if tid == pylibcudf.TypeId.LIST: + return cudf.ListDtype(dtype_from_pylibcudf_lists_column(child)) + elif tid == pylibcudf.TypeId.EMPTY: + return cudf.ListDtype("int8") + else: + return cudf.ListDtype( + dtype_from_pylibcudf_column(child) + ) + + +def dtype_from_pylibcudf_structs_column(col): + fields = { + str(i): dtype_from_pylibcudf_column(col.child(i)) + for i in range(col.num_children()) + } + return cudf.StructDtype(fields) + + +def dtype_from_pylibcudf_column(col): + type_ = col.type() + tid = type_.id() + + if tid == pylibcudf.TypeId.LIST: + return dtype_from_pylibcudf_lists_column(col) + elif tid == pylibcudf.TypeId.STRUCT: + return dtype_from_pylibcudf_structs_column(col) + elif tid == pylibcudf.TypeId.DECIMAL64: + return cudf.Decimal64Dtype( + precision=cudf.Decimal64Dtype.MAX_PRECISION, + scale=-type_.scale() + ) + elif tid == pylibcudf.TypeId.DECIMAL32: + return cudf.Decimal32Dtype( + precision=cudf.Decimal32Dtype.MAX_PRECISION, + scale=-type_.scale() + ) + elif tid == pylibcudf.TypeId.DECIMAL128: + return cudf.Decimal128Dtype( + precision=cudf.Decimal128Dtype.MAX_PRECISION, + scale=-type_.scale() 
+ ) + else: + return PYLIBCUDF_TO_SUPPORTED_NUMPY_TYPES[ + (tid) + ] diff --git a/python/cudf/cudf/_lib/utils.pxd b/python/cudf/cudf/_lib/utils.pxd index f2cdc110b64..653fa8f2b8b 100644 --- a/python/cudf/cudf/_lib/utils.pxd +++ b/python/cudf/cudf/_lib/utils.pxd @@ -4,7 +4,6 @@ from libcpp.memory cimport unique_ptr from libcpp.string cimport string from libcpp.vector cimport vector -from cudf._lib cimport pylibcudf from cudf._lib.cpp.column.column cimport column_view from cudf._lib.cpp.table.table cimport table, table_view @@ -19,4 +18,4 @@ cdef table_view table_view_from_columns(columns) except * cdef table_view table_view_from_table(tbl, ignore_index=*) except* cdef columns_from_unique_ptr(unique_ptr[table] c_tbl) cdef columns_from_table_view(table_view tv, object owners) -cdef columns_from_pylibcudf_table(pylibcudf.Table table) +cdef columns_from_pylibcudf_table(tbl) diff --git a/python/cudf/cudf/_lib/utils.pyx b/python/cudf/cudf/_lib/utils.pyx index 8907143c289..03982a58517 100644 --- a/python/cudf/cudf/_lib/utils.pyx +++ b/python/cudf/cudf/_lib/utils.pyx @@ -11,7 +11,6 @@ from libcpp.string cimport string from libcpp.utility cimport move from libcpp.vector cimport vector -from cudf._lib cimport pylibcudf from cudf._lib.column cimport Column from cudf._lib.cpp.column.column cimport column, column_view from cudf._lib.cpp.table.table cimport table @@ -247,7 +246,7 @@ cdef columns_from_unique_ptr( return columns -cdef columns_from_pylibcudf_table(pylibcudf.Table tbl): +cdef columns_from_pylibcudf_table(tbl): """Convert a pylibcudf table into list of columns. Parameters @@ -260,8 +259,7 @@ cdef columns_from_pylibcudf_table(pylibcudf.Table tbl): list[Column] A list of columns. """ - cdef pylibcudf.Column plc - return [Column.from_pylibcudf(plc) for plc in tbl.columns] + return [Column.from_pylibcudf(plc) for plc in tbl.columns()] cdef data_from_unique_ptr( From 261bcb2a1f190ec3a6689258f548b1f2c3f49dd6 Mon Sep 17 00:00:00 2001 From: David Wendt <45795991+davidwendt@users.noreply.github.com> Date: Mon, 21 Aug 2023 17:16:14 -0400 Subject: [PATCH 079/230] Add minhash support for MurmurHash3_x64_128 (#13796) Adds `nvtext::minhash64` to libcudf and the Cython/Python changes to call it. The `MurmurHash3_x64_128` is called and only the first `uint64` value is used. The libcudf API was changed to remove the `hash_id` parameter since it was incompatible with the seed types. 
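A rough usage sketch from the Python side (not part of the patch; the exact Python signature is an assumption here and is defined by this patch's `string.py`/`minhash.pyx` changes). Seeds for `minhash64` are assumed to be uint64 values, matching the MurmurHash3_x64_128 seed type:

```python
import numpy as np
import cudf

s = cudf.Series(["this is my", "favorite book"])
seeds = cudf.Series([0, 42], dtype=np.uint64)

# One minimum hash per seed is computed for each string over 5-character
# substrings; the result is a list column holding one uint64 value per seed.
print(s.str.minhash64(seeds=seeds, width=5))
```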
Authors: - David Wendt (https://github.com/davidwendt) - Bradley Dice (https://github.com/bdice) Approvers: - Bradley Dice (https://github.com/bdice) - Vyas Ramasubramani (https://github.com/vyasr) URL: https://github.com/rapidsai/cudf/pull/13796 --- cpp/benchmarks/text/minhash.cpp | 17 +- cpp/include/nvtext/minhash.hpp | 90 +++++-- cpp/src/text/minhash.cu | 250 ++++++++++++------ cpp/tests/text/minhash_tests.cpp | 67 ++++- python/cudf/cudf/_lib/cpp/nvtext/minhash.pxd | 8 +- python/cudf/cudf/_lib/nvtext/minhash.pyx | 36 ++- python/cudf/cudf/_lib/strings/__init__.py | 2 +- python/cudf/cudf/core/column/string.py | 72 +++-- .../cudf/cudf/tests/text/test_text_methods.py | 48 +++- 9 files changed, 434 insertions(+), 156 deletions(-) diff --git a/cpp/benchmarks/text/minhash.cpp b/cpp/benchmarks/text/minhash.cpp index bcc254575c0..1b60caa24de 100644 --- a/cpp/benchmarks/text/minhash.cpp +++ b/cpp/benchmarks/text/minhash.cpp @@ -30,6 +30,7 @@ static void bench_minhash(nvbench::state& state) auto const row_width = static_cast(state.get_int64("row_width")); auto const hash_width = static_cast(state.get_int64("hash_width")); auto const seed_count = static_cast(state.get_int64("seed_count")); + auto const base64 = state.get_int64("hash_type") == 64; if (static_cast(num_rows) * static_cast(row_width) >= static_cast(std::numeric_limits::max())) { @@ -44,9 +45,9 @@ static void bench_minhash(nvbench::state& state) data_profile const seeds_profile = data_profile_builder().null_probability(0).distribution( cudf::type_to_id(), distribution_id::NORMAL, 0, row_width); - auto const seeds_table = create_random_table( - {cudf::type_to_id()}, row_count{seed_count}, seeds_profile); - auto seeds = seeds_table->get_column(0); + auto const seed_type = base64 ? cudf::type_id::UINT64 : cudf::type_id::UINT32; + auto const seeds_table = create_random_table({seed_type}, row_count{seed_count}, seeds_profile); + auto seeds = seeds_table->get_column(0); seeds.set_null_mask(rmm::device_buffer{}, 0); state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::get_default_stream().value())); @@ -56,13 +57,15 @@ static void bench_minhash(nvbench::state& state) state.add_global_memory_writes(num_rows); // output are hashes state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { - auto result = nvtext::minhash(input, seeds.view(), hash_width); + auto result = base64 ? nvtext::minhash64(input, seeds.view(), hash_width) + : nvtext::minhash(input, seeds.view(), hash_width); }); } NVBENCH_BENCH(bench_minhash) .set_name("minhash") - .add_int64_axis("num_rows", {1024, 4096, 8192, 16364, 32768, 262144}) + .add_int64_axis("num_rows", {1024, 8192, 16364, 131072}) .add_int64_axis("row_width", {128, 512, 2048}) - .add_int64_axis("hash_width", {5, 10, 25}) - .add_int64_axis("seed_count", {2, 26}); + .add_int64_axis("hash_width", {5, 10}) + .add_int64_axis("seed_count", {2, 26}) + .add_int64_axis("hash_type", {32, 64}); diff --git a/cpp/include/nvtext/minhash.hpp b/cpp/include/nvtext/minhash.hpp index dda23a2ba5b..47c625b5079 100644 --- a/cpp/include/nvtext/minhash.hpp +++ b/cpp/include/nvtext/minhash.hpp @@ -36,24 +36,24 @@ namespace nvtext { * * Any null row entries result in corresponding null output rows. * + * This function uses MurmurHash3_x86_32 for the hash algorithm. 
+ * * @throw std::invalid_argument if the width < 2 - * @throw std::invalid_argument if hash_function is not HASH_MURMUR3 * * @param input Strings column to compute minhash - * @param seed Seed value used for the MurmurHash3_x86_32 algorithm + * @param seed Seed value used for the hash algorithm * @param width The character width used for apply substrings; * Default is 4 characters. - * @param hash_function Hash algorithm to use; - * Only HASH_MURMUR3 is currently supported. + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned column's device memory * @return Minhash values for each string in input */ std::unique_ptr minhash( cudf::strings_column_view const& input, - cudf::numeric_scalar seed = cudf::numeric_scalar(cudf::DEFAULT_HASH_SEED), - cudf::size_type width = 4, - cudf::hash_id hash_function = cudf::hash_id::HASH_MURMUR3, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + cudf::numeric_scalar seed = 0, + cudf::size_type width = 4, + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** * @brief Returns the minhash values for each string per seed @@ -64,28 +64,88 @@ std::unique_ptr minhash( * string. The order of the elements in each row match the order of * the seeds provided in the `seeds` parameter. * + * This function uses MurmurHash3_x86_32 for the hash algorithm. + * * Any null row entries result in corresponding null output rows. * * @throw std::invalid_argument if the width < 2 - * @throw std::invalid_argument if hash_function is not HASH_MURMUR3 * @throw std::invalid_argument if seeds is empty * @throw std::overflow_error if `seeds * input.size()` exceeds the column size limit * * @param input Strings column to compute minhash - * @param seeds Seed values used for the MurmurHash3_x86_32 algorithm + * @param seeds Seed values used for the hash algorithm * @param width The character width used for apply substrings; * Default is 4 characters. - * @param hash_function Hash algorithm to use; - * Only HASH_MURMUR3 is currently supported. + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned column's device memory * @return List column of minhash values for each string per seed - * or a hash_value_type column if only a single seed is specified */ std::unique_ptr minhash( cudf::strings_column_view const& input, - cudf::device_span seeds, + cudf::device_span seeds, + cudf::size_type width = 4, + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + +/** + * @brief Returns the minhash value for each string + * + * Hash values are computed from substrings of each string and the + * minimum hash value is returned for each string. + * + * Any null row entries result in corresponding null output rows. + * + * This function uses MurmurHash3_x64_128 for the hash algorithm. + * The hash function returns 2 uint64 values but only the first value + * is used with the minhash calculation. + * + * @throw std::invalid_argument if the width < 2 + * + * @param input Strings column to compute minhash + * @param seed Seed value used for the hash algorithm + * @param width The character width used for apply substrings; + * Default is 4 characters. 
+ * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate the returned column's device memory + * @return Minhash values as UINT64 for each string in input + */ +std::unique_ptr minhash64( + cudf::strings_column_view const& input, + cudf::numeric_scalar seed = 0, + cudf::size_type width = 4, + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + +/** + * @brief Returns the minhash values for each string per seed + * + * Hash values are computed from substrings of each string and the + * minimum hash value is returned for each string for each seed. + * Each row of the list column are seed results for the corresponding + * string. The order of the elements in each row match the order of + * the seeds provided in the `seeds` parameter. + * + * This function uses MurmurHash3_x64_128 for the hash algorithm. + * + * Any null row entries result in corresponding null output rows. + * + * @throw std::invalid_argument if the width < 2 + * @throw std::invalid_argument if seeds is empty + * @throw std::overflow_error if `seeds * input.size()` exceeds the column size limit + * + * @param input Strings column to compute minhash + * @param seeds Seed values used for the hash algorithm + * @param width The character width used for apply substrings; + * Default is 4 characters. + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate the returned column's device memory + * @return List column of minhash values for each string per seed + */ +std::unique_ptr minhash64( + cudf::strings_column_view const& input, + cudf::device_span seeds, cudf::size_type width = 4, - cudf::hash_id hash_function = cudf::hash_id::HASH_MURMUR3, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @} */ // end of group diff --git a/cpp/src/text/minhash.cu b/cpp/src/text/minhash.cu index 05210b60154..f06eaa5b52c 100644 --- a/cpp/src/text/minhash.cu +++ b/cpp/src/text/minhash.cu @@ -25,6 +25,7 @@ #include #include #include +#include #include #include #include @@ -35,8 +36,6 @@ #include #include -#include -#include #include @@ -51,79 +50,96 @@ namespace { * * This is a warp-per-string algorithm where parallel threads within a warp * work on substrings of a single string row. + * + * @tparam HashFunction hash function to use on each substring + * + * @param d_strings Strings column to process + * @param seeds Seeds for hashing each string + * @param width Substring window size in characters + * @param d_hashes Minhash output values for each string */ -struct minhash_fn { - cudf::column_device_view d_strings; - cudf::device_span seeds; - cudf::size_type width; - cudf::hash_value_type* d_hashes; - - __device__ void operator()(std::size_t idx) - { - auto const str_idx = static_cast(idx / cudf::detail::warp_size); - auto const lane_idx = static_cast(idx % cudf::detail::warp_size); - - if (d_strings.is_null(str_idx)) { return; } - - auto const d_str = d_strings.element(str_idx); - auto const d_output = d_hashes + (str_idx * seeds.size()); - - // initialize hashes output for this string - if (lane_idx == 0) { - auto const init = d_str.empty() ? 
0 : std::numeric_limits::max(); - thrust::fill(thrust::seq, d_output, d_output + seeds.size(), init); - } - __syncwarp(); - - auto const begin = d_str.data() + lane_idx; - auto const end = d_str.data() + d_str.size_bytes(); - - // each lane hashes 'width' substrings of d_str - for (auto itr = begin; itr < end; itr += cudf::detail::warp_size) { - if (cudf::strings::detail::is_utf8_continuation_char(*itr)) { continue; } - auto const check_str = // used for counting 'width' characters - cudf::string_view(itr, static_cast(thrust::distance(itr, end))); - auto const [bytes, left] = - cudf::strings::detail::bytes_to_character_position(check_str, width); - if ((itr != d_str.data()) && (left > 0)) { continue; } // true if past the end of the string - - auto const hash_str = cudf::string_view(itr, bytes); - // hashing with each seed on the same section of the string is 10x faster than - // computing the substrings for each seed - for (std::size_t seed_idx = 0; seed_idx < seeds.size(); ++seed_idx) { - auto const hasher = - cudf::hashing::detail::MurmurHash3_x86_32{seeds[seed_idx]}; +template < + typename HashFunction, + typename hash_value_type = std:: + conditional_t, uint32_t, uint64_t>> +__global__ void minhash_kernel(cudf::column_device_view const d_strings, + cudf::device_span seeds, + cudf::size_type width, + hash_value_type* d_hashes) +{ + auto const idx = static_cast(threadIdx.x + blockIdx.x * blockDim.x); + if (idx >= (static_cast(d_strings.size()) * + static_cast(cudf::detail::warp_size))) { + return; + } + + auto const str_idx = static_cast(idx / cudf::detail::warp_size); + auto const lane_idx = static_cast(idx % cudf::detail::warp_size); + + if (d_strings.is_null(str_idx)) { return; } + + auto const d_str = d_strings.element(str_idx); + auto const d_output = d_hashes + (str_idx * seeds.size()); + + // initialize hashes output for this string + if (lane_idx == 0) { + auto const init = d_str.empty() ? 0 : std::numeric_limits::max(); + thrust::fill(thrust::seq, d_output, d_output + seeds.size(), init); + } + __syncwarp(); + + auto const begin = d_str.data() + lane_idx; + auto const end = d_str.data() + d_str.size_bytes(); + + // each lane hashes 'width' substrings of d_str + for (auto itr = begin; itr < end; itr += cudf::detail::warp_size) { + if (cudf::strings::detail::is_utf8_continuation_char(*itr)) { continue; } + auto const check_str = // used for counting 'width' characters + cudf::string_view(itr, static_cast(thrust::distance(itr, end))); + auto const [bytes, left] = cudf::strings::detail::bytes_to_character_position(check_str, width); + if ((itr != d_str.data()) && (left > 0)) { continue; } // true if past the end of the string + + auto const hash_str = cudf::string_view(itr, bytes); + // hashing with each seed on the same section of the string is 10x faster than + // computing the substrings for each seed + for (std::size_t seed_idx = 0; seed_idx < seeds.size(); ++seed_idx) { + auto const hasher = HashFunction(seeds[seed_idx]); + // hash substring and store the min value + if constexpr (std::is_same_v) { auto const hvalue = hasher(hash_str); - cuda::atomic_ref ref{ - *(d_output + seed_idx)}; + cuda::atomic_ref ref{*(d_output + seed_idx)}; + ref.fetch_min(hvalue, cuda::std::memory_order_relaxed); + } else { + // This code path assumes the use of MurmurHash3_x64_128 which produces 2 uint64 values + // but only uses the first uint64 value as requested by the LLM team. 
+ auto const hvalue = thrust::get<0>(hasher(hash_str)); + cuda::atomic_ref ref{*(d_output + seed_idx)}; ref.fetch_min(hvalue, cuda::std::memory_order_relaxed); } } } -}; - -} // namespace +} -std::unique_ptr minhash(cudf::strings_column_view const& input, - cudf::device_span seeds, - cudf::size_type width, - cudf::hash_id hash_function, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) +template < + typename HashFunction, + typename hash_value_type = std:: + conditional_t, uint32_t, uint64_t>> +std::unique_ptr minhash_fn(cudf::strings_column_view const& input, + cudf::device_span seeds, + cudf::size_type width, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { CUDF_EXPECTS(!seeds.empty(), "Parameter seeds cannot be empty", std::invalid_argument); CUDF_EXPECTS(width >= 2, "Parameter width should be an integer value of 2 or greater", std::invalid_argument); - CUDF_EXPECTS(hash_function == cudf::hash_id::HASH_MURMUR3, - "Only murmur3 hash algorithm supported", - std::invalid_argument); CUDF_EXPECTS((static_cast(input.size()) * seeds.size()) < static_cast(std::numeric_limits::max()), "The number of seeds times the number of input rows exceeds the column size limit", std::overflow_error); - auto output_type = cudf::data_type{cudf::type_to_id()}; + auto const output_type = cudf::data_type{cudf::type_to_id()}; if (input.is_empty()) { return cudf::make_empty_column(output_type); } auto const d_strings = cudf::column_device_view::create(input.parent(), stream); @@ -133,27 +149,26 @@ std::unique_ptr minhash(cudf::strings_column_view const& input, cudf::mask_state::UNALLOCATED, stream, mr); - auto d_hashes = hashes->mutable_view().data(); - - thrust::for_each_n( - rmm::exec_policy(stream), - thrust::counting_iterator(std::size_t{0}), - static_cast(input.size()) * static_cast(cudf::detail::warp_size), - minhash_fn{*d_strings, seeds, width, d_hashes}); - - if (seeds.size() == 1) { - hashes->set_null_mask(cudf::detail::copy_bitmask(input.parent(), stream, mr), - input.null_count()); - return hashes; - } + auto d_hashes = hashes->mutable_view().data(); + + constexpr int block_size = 256; + cudf::detail::grid_1d grid{input.size() * cudf::detail::warp_size, block_size}; + minhash_kernel<<>>( + *d_strings, seeds, width, d_hashes); + + return hashes; +} +std::unique_ptr build_list_result(cudf::strings_column_view const& input, + std::unique_ptr&& hashes, + cudf::size_type seeds_size, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ // build the offsets for the output lists column - auto offsets = cudf::detail::sequence( - input.size() + 1, - cudf::numeric_scalar(0), - cudf::numeric_scalar(static_cast(seeds.size())), - stream, - mr); + auto const zero = cudf::numeric_scalar(0); + auto const size = cudf::numeric_scalar(seeds_size); + auto offsets = cudf::detail::sequence(input.size() + 1, zero, size, stream, mr); hashes->set_null_mask(rmm::device_buffer{}, 0); // children have no nulls // build the lists column from the offsets and the hashes @@ -170,28 +185,95 @@ std::unique_ptr minhash(cudf::strings_column_view const& input, } return result; } +} // namespace +std::unique_ptr minhash(cudf::strings_column_view const& input, + cudf::numeric_scalar seed, + cudf::size_type width, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + using HashFunction = cudf::hashing::detail::MurmurHash3_x86_32; + auto const seeds = cudf::device_span{seed.data(), 1}; + auto hashes = detail::minhash_fn(input, seeds, width, stream, mr); + 
hashes->set_null_mask(cudf::detail::copy_bitmask(input.parent(), stream, mr), input.null_count()); + return hashes; +} + +std::unique_ptr minhash(cudf::strings_column_view const& input, + cudf::device_span seeds, + cudf::size_type width, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + using HashFunction = cudf::hashing::detail::MurmurHash3_x86_32; + auto hashes = detail::minhash_fn(input, seeds, width, stream, mr); + return build_list_result(input, std::move(hashes), seeds.size(), stream, mr); +} + +std::unique_ptr minhash64(cudf::strings_column_view const& input, + cudf::numeric_scalar seed, + cudf::size_type width, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + using HashFunction = cudf::hashing::detail::MurmurHash3_x64_128; + auto const seeds = cudf::device_span{seed.data(), 1}; + auto hashes = detail::minhash_fn(input, seeds, width, stream, mr); + hashes->set_null_mask(cudf::detail::copy_bitmask(input.parent(), stream, mr), input.null_count()); + return hashes; +} + +std::unique_ptr minhash64(cudf::strings_column_view const& input, + cudf::device_span seeds, + cudf::size_type width, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + using HashFunction = cudf::hashing::detail::MurmurHash3_x64_128; + auto hashes = detail::minhash_fn(input, seeds, width, stream, mr); + return build_list_result(input, std::move(hashes), seeds.size(), stream, mr); +} } // namespace detail std::unique_ptr minhash(cudf::strings_column_view const& input, - cudf::numeric_scalar seed, + cudf::numeric_scalar seed, cudf::size_type width, - cudf::hash_id hash_function, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - auto seeds = cudf::device_span{seed.data(), 1}; - return detail::minhash(input, seeds, width, hash_function, cudf::get_default_stream(), mr); + return detail::minhash(input, seed, width, stream, mr); } std::unique_ptr minhash(cudf::strings_column_view const& input, - cudf::device_span seeds, + cudf::device_span seeds, cudf::size_type width, - cudf::hash_id hash_function, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::minhash(input, seeds, width, hash_function, cudf::get_default_stream(), mr); + return detail::minhash(input, seeds, width, stream, mr); +} + +std::unique_ptr minhash64(cudf::strings_column_view const& input, + cudf::numeric_scalar seed, + cudf::size_type width, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + CUDF_FUNC_RANGE(); + return detail::minhash64(input, seed, width, stream, mr); +} + +std::unique_ptr minhash64(cudf::strings_column_view const& input, + cudf::device_span seeds, + cudf::size_type width, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + CUDF_FUNC_RANGE(); + return detail::minhash64(input, seeds, width, stream, mr); } } // namespace nvtext diff --git a/cpp/tests/text/minhash_tests.cpp b/cpp/tests/text/minhash_tests.cpp index fa4e2a91600..b1c961ec9e1 100644 --- a/cpp/tests/text/minhash_tests.cpp +++ b/cpp/tests/text/minhash_tests.cpp @@ -34,6 +34,7 @@ struct MinHashTest : public cudf::test::BaseFixture {}; TEST_F(MinHashTest, Basic) { + auto validity = cudf::test::iterators::null_at(1); auto input = cudf::test::strings_column_wrapper({"doc 1", "", @@ -42,15 +43,26 @@ TEST_F(MinHashTest, Basic) "doc 3", "d", "The quick brown fox jumpéd over the lazy brown dog."}, - {1, 0, 1, 1, 1, 1, 1}); + validity); auto view = 
cudf::strings_column_view(input); auto results = nvtext::minhash(view); - auto expected = cudf::test::fixed_width_column_wrapper( - {1207251914u, 0u, 21141582u, 0u, 1207251914u, 655955059u, 86520422u}, {1, 0, 1, 1, 1, 1, 1}); + auto expected = cudf::test::fixed_width_column_wrapper( + {1207251914u, 0u, 21141582u, 0u, 1207251914u, 655955059u, 86520422u}, validity); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); + + auto results64 = nvtext::minhash64(view); + auto expected64 = cudf::test::fixed_width_column_wrapper({774489391575805754ul, + 0ul, + 3232308021562742685ul, + 0ul, + 13145552576991307582ul, + 14660046701545912182ul, + 398062025280761388ul}, + validity); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results64, expected64); } TEST_F(MinHashTest, LengthEqualsWidth) @@ -58,7 +70,7 @@ TEST_F(MinHashTest, LengthEqualsWidth) auto input = cudf::test::strings_column_wrapper({"abcdé", "fghjk", "lmnop", "qrstu", "vwxyz"}); auto view = cudf::strings_column_view(input); auto results = nvtext::minhash(view, 0, 5); - auto expected = cudf::test::fixed_width_column_wrapper( + auto expected = cudf::test::fixed_width_column_wrapper( {3825281041u, 2728681928u, 1984332911u, 3965004915u, 192452857u}); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); } @@ -74,11 +86,10 @@ TEST_F(MinHashTest, MultiSeed) auto view = cudf::strings_column_view(input); - auto seeds = cudf::test::fixed_width_column_wrapper({0, 1, 2}); - + auto seeds = cudf::test::fixed_width_column_wrapper({0, 1, 2}); auto results = nvtext::minhash(view, cudf::column_view(seeds)); - using LCW = cudf::test::lists_column_wrapper; + using LCW = cudf::test::lists_column_wrapper; // clang-format off LCW expected({LCW{1207251914u, 1677652962u, 1061355987u}, LCW{ 21141582u, 580916568u, 1258052021u}, @@ -87,6 +98,19 @@ TEST_F(MinHashTest, MultiSeed) LCW{ 86520422u, 236622901u, 102546228u}}); // clang-format on CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); + + auto seeds64 = cudf::test::fixed_width_column_wrapper({0, 1, 2}); + auto results64 = nvtext::minhash64(view, cudf::column_view(seeds64)); + + using LCW64 = cudf::test::lists_column_wrapper; + // clang-format off + LCW64 expected64({LCW64{ 774489391575805754ul, 10435654231793485448ul, 1188598072697676120ul}, + LCW64{ 3232308021562742685ul, 4445611509348165860ul, 1188598072697676120ul}, + LCW64{13145552576991307582ul, 6846192680998069919ul, 1188598072697676120ul}, + LCW64{14660046701545912182ul, 17106501326045553694ul, 17713478494106035784ul}, + LCW64{ 398062025280761388ul, 377720198157450084ul, 984941365662009329ul}}); + // clang-format on + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results64, expected64); } TEST_F(MinHashTest, MultiSeedWithNullInputRow) @@ -95,13 +119,24 @@ TEST_F(MinHashTest, MultiSeedWithNullInputRow) auto input = cudf::test::strings_column_wrapper({"abcdéfgh", "", "", "stuvwxyz"}, validity); auto view = cudf::strings_column_view(input); - auto seeds = cudf::test::fixed_width_column_wrapper({1, 2}); + auto seeds = cudf::test::fixed_width_column_wrapper({1, 2}); auto results = nvtext::minhash(view, cudf::column_view(seeds)); - using LCW = cudf::test::lists_column_wrapper; + using LCW = cudf::test::lists_column_wrapper; LCW expected({LCW{484984072u, 1074168784u}, LCW{}, LCW{0u, 0u}, LCW{571652169u, 173528385u}}, validity); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); + + auto seeds64 = cudf::test::fixed_width_column_wrapper({11, 22}); + auto results64 = nvtext::minhash64(view, cudf::column_view(seeds64)); + + using LCW64 = cudf::test::lists_column_wrapper; + LCW64 
expected64({LCW64{2597399324547032480ul, 4461410998582111052ul}, + LCW64{}, + LCW64{0ul, 0ul}, + LCW64{2717781266371273264ul, 6977325820868387259ul}}, + validity); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results64, expected64); } TEST_F(MinHashTest, EmptyTest) @@ -110,6 +145,8 @@ TEST_F(MinHashTest, EmptyTest) auto view = cudf::strings_column_view(input->view()); auto results = nvtext::minhash(view); EXPECT_EQ(results->size(), 0); + results = nvtext::minhash64(view); + EXPECT_EQ(results->size(), 0); } TEST_F(MinHashTest, ErrorsTest) @@ -117,15 +154,19 @@ TEST_F(MinHashTest, ErrorsTest) auto input = cudf::test::strings_column_wrapper({"this string intentionally left blank"}); auto view = cudf::strings_column_view(input); EXPECT_THROW(nvtext::minhash(view, 0, 0), std::invalid_argument); - EXPECT_THROW(nvtext::minhash(view, 0, 0, cudf::hash_id::HASH_MD5), std::invalid_argument); - auto seeds = cudf::test::fixed_width_column_wrapper(); + EXPECT_THROW(nvtext::minhash64(view, 0, 0), std::invalid_argument); + auto seeds = cudf::test::fixed_width_column_wrapper(); EXPECT_THROW(nvtext::minhash(view, cudf::column_view(seeds)), std::invalid_argument); + auto seeds64 = cudf::test::fixed_width_column_wrapper(); + EXPECT_THROW(nvtext::minhash64(view, cudf::column_view(seeds64)), std::invalid_argument); std::vector h_input(50000, ""); input = cudf::test::strings_column_wrapper(h_input.begin(), h_input.end()); view = cudf::strings_column_view(input); - auto const zeroes = thrust::constant_iterator(0); - seeds = cudf::test::fixed_width_column_wrapper(zeroes, zeroes + 50000); + auto const zeroes = thrust::constant_iterator(0); + seeds = cudf::test::fixed_width_column_wrapper(zeroes, zeroes + 50000); EXPECT_THROW(nvtext::minhash(view, cudf::column_view(seeds)), std::overflow_error); + seeds64 = cudf::test::fixed_width_column_wrapper(zeroes, zeroes + 50000); + EXPECT_THROW(nvtext::minhash64(view, cudf::column_view(seeds64)), std::overflow_error); } diff --git a/python/cudf/cudf/_lib/cpp/nvtext/minhash.pxd b/python/cudf/cudf/_lib/cpp/nvtext/minhash.pxd index 0509083ae3b..08b3330953e 100644 --- a/python/cudf/cudf/_lib/cpp/nvtext/minhash.pxd +++ b/python/cudf/cudf/_lib/cpp/nvtext/minhash.pxd @@ -4,7 +4,6 @@ from libcpp.memory cimport unique_ptr from cudf._lib.cpp.column.column cimport column from cudf._lib.cpp.column.column_view cimport column_view -from cudf._lib.cpp.hash cimport hash_id from cudf._lib.cpp.types cimport size_type @@ -14,5 +13,10 @@ cdef extern from "nvtext/minhash.hpp" namespace "nvtext" nogil: const column_view &strings, const column_view &seeds, const size_type width, - const hash_id hash_function + ) except + + + cdef unique_ptr[column] minhash64( + const column_view &strings, + const column_view &seeds, + const size_type width, ) except + diff --git a/python/cudf/cudf/_lib/nvtext/minhash.pyx b/python/cudf/cudf/_lib/nvtext/minhash.pyx index f0b2c799912..6ed5ca834ee 100644 --- a/python/cudf/cudf/_lib/nvtext/minhash.pyx +++ b/python/cudf/cudf/_lib/nvtext/minhash.pyx @@ -8,31 +8,47 @@ from libcpp.utility cimport move from cudf._lib.column cimport Column from cudf._lib.cpp.column.column cimport column from cudf._lib.cpp.column.column_view cimport column_view -from cudf._lib.cpp.hash cimport hash_id as cpp_hash_id -from cudf._lib.cpp.nvtext.minhash cimport minhash as cpp_minhash +from cudf._lib.cpp.nvtext.minhash cimport ( + minhash as cpp_minhash, + minhash64 as cpp_minhash64, +) from cudf._lib.cpp.types cimport size_type @acquire_spill_lock() -def minhash(Column strings, Column seeds, int width, str 
method): +def minhash(Column strings, Column seeds, int width): cdef column_view c_strings = strings.view() cdef size_type c_width = width cdef column_view c_seeds = seeds.view() cdef unique_ptr[column] c_result - cdef cpp_hash_id c_hash_function - if method == "murmur3": - c_hash_function = cpp_hash_id.HASH_MURMUR3 - else: - raise ValueError(f"Unsupported hash function: {method}") with nogil: c_result = move( cpp_minhash( c_strings, c_seeds, - c_width, - c_hash_function + c_width + ) + ) + + return Column.from_unique_ptr(move(c_result)) + + +@acquire_spill_lock() +def minhash64(Column strings, Column seeds, int width): + + cdef column_view c_strings = strings.view() + cdef size_type c_width = width + cdef column_view c_seeds = seeds.view() + cdef unique_ptr[column] c_result + + with nogil: + c_result = move( + cpp_minhash64( + c_strings, + c_seeds, + c_width ) ) diff --git a/python/cudf/cudf/_lib/strings/__init__.py b/python/cudf/cudf/_lib/strings/__init__.py index fa51d78b5c4..16875e4397e 100644 --- a/python/cudf/cudf/_lib/strings/__init__.py +++ b/python/cudf/cudf/_lib/strings/__init__.py @@ -6,7 +6,7 @@ hash_character_ngrams, ) from cudf._lib.nvtext.jaccard import jaccard_index -from cudf._lib.nvtext.minhash import minhash +from cudf._lib.nvtext.minhash import minhash, minhash64 from cudf._lib.nvtext.ngrams_tokenize import ngrams_tokenize from cudf._lib.nvtext.normalize import normalize_characters, normalize_spaces from cudf._lib.nvtext.replace import filter_tokens, replace_tokens diff --git a/python/cudf/cudf/core/column/string.py b/python/cudf/cudf/core/column/string.py index 1a7679e6336..fe21dc87bac 100644 --- a/python/cudf/cudf/core/column/string.py +++ b/python/cudf/cudf/core/column/string.py @@ -5287,26 +5287,20 @@ def edit_distance_matrix(self) -> SeriesOrIndex: ) def minhash( - self, - seeds: Optional[cudf.Series] = None, - n: int = 4, - method: str = "murmur3", + self, seeds: Optional[ColumnLike] = None, width: int = 4 ) -> SeriesOrIndex: """ Compute the minhash of a strings column. + This uses the MurmurHash3_x86_32 algorithm for the hash function. Parameters ---------- - seeds : Series + seeds : ColumnLike The seeds used for the hash algorithm. Must be of type uint32. - n : int + width : int The width of the substring to hash. Default is 4 characters. - method : str - Hash function to use. - Only 'murmur3' (MurmurHash3_32) is supported. - Default is 'murmur3'. Examples -------- @@ -5314,9 +5308,9 @@ def minhash( >>> str_series = cudf.Series(['this is my', 'favorite book']) >>> seeds = cudf.Series([0], dtype=np.uint32) >>> str_series.str.minhash(seeds) - 0 21141582 - 1 962346254 - dtype: uint32 + 0 [21141582] + 1 [962346254] + dtype: list >>> seeds = cudf.Series([0, 1, 2], dtype=np.uint32) >>> str_series.str.minhash(seeds) 0 [21141582, 403093213, 1258052021] @@ -5325,14 +5319,54 @@ def minhash( """ if seeds is None: seeds_column = column.as_column(0, dtype=np.uint32, length=1) - elif isinstance(seeds, cudf.Series) and seeds.dtype == np.uint32: - seeds_column = seeds._column else: - raise ValueError( - f"Expecting a Series with dtype uint32, got {type(seeds)}" - ) + seeds_column = column.as_column(seeds) + if seeds_column.dtype != np.uint32: + raise ValueError( + f"Expecting a Series with dtype uint32, got {type(seeds)}" + ) + return self._return_or_inplace( + libstrings.minhash(self._column, seeds_column, width) + ) + + def minhash64( + self, seeds: Optional[ColumnLike] = None, width: int = 4 + ) -> SeriesOrIndex: + """ + Compute the minhash of a strings column. 
+ This uses the MurmurHash3_x64_128 algorithm for the hash function. + This function generates 2 uint64 values but only the first + uint64 value is used. + + Parameters + ---------- + seeds : ColumnLike + The seeds used for the hash algorithm. + Must be of type uint64. + width : int + The width of the substring to hash. + Default is 4 characters. + + Examples + -------- + >>> import cudf + >>> str_series = cudf.Series(['this is my', 'favorite book']) + >>> seeds = cudf.Series([0, 1, 2], dtype=np.uint64) + >>> str_series.str.minhash64(seeds) + 0 [3232308021562742685, 4445611509348165860, 586435843695903598] + 1 [23008204270530356, 1281229757012344693, 153762819128779913] + dtype: list + """ + if seeds is None: + seeds_column = column.as_column(0, dtype=np.uint64, length=1) + else: + seeds_column = column.as_column(seeds) + if seeds_column.dtype != np.uint64: + raise ValueError( + f"Expecting a Series with dtype uint64, got {type(seeds)}" + ) return self._return_or_inplace( - libstrings.minhash(self._column, seeds_column, n, method) + libstrings.minhash64(self._column, seeds_column, width) ) def jaccard_index(self, input: cudf.Series, width: int) -> SeriesOrIndex: diff --git a/python/cudf/cudf/tests/text/test_text_methods.py b/python/cudf/cudf/tests/text/test_text_methods.py index e1dda1ae5d1..8cda15e4acc 100644 --- a/python/cudf/cudf/tests/text/test_text_methods.py +++ b/python/cudf/cudf/tests/text/test_text_methods.py @@ -836,7 +836,15 @@ def test_is_vowel_consonant(): def test_minhash(): strings = cudf.Series(["this is my", "favorite book", None, ""]) - expected = cudf.Series([21141582, 962346254, None, 0], dtype=np.uint32) + + expected = cudf.Series( + [ + cudf.Series([21141582], dtype=np.uint32), + cudf.Series([962346254], dtype=np.uint32), + None, + cudf.Series([0], dtype=np.uint32), + ] + ) actual = strings.str.minhash() assert_eq(expected, actual) seeds = cudf.Series([0, 1, 2], dtype=np.uint32) @@ -848,16 +856,46 @@ def test_minhash(): cudf.Series([0, 0, 0], dtype=np.uint32), ] ) - actual = strings.str.minhash(seeds=seeds, n=5) + actual = strings.str.minhash(seeds=seeds, width=5) assert_eq(expected, actual) + expected = cudf.Series( + [ + cudf.Series([3232308021562742685], dtype=np.uint64), + cudf.Series([23008204270530356], dtype=np.uint64), + None, + cudf.Series([0], dtype=np.uint64), + ] + ) + actual = strings.str.minhash64() + assert_eq(expected, actual) + seeds = cudf.Series([0, 1, 2], dtype=np.uint64) + expected = cudf.Series( + [ + cudf.Series( + [7082801294247314046, 185949556058924788, 167570629329462454], + dtype=np.uint64, + ), + cudf.Series( + [382665377781028452, 86243762733551437, 7688750597953083512], + dtype=np.uint64, + ), + None, + cudf.Series([0, 0, 0], dtype=np.uint64), + ] + ) + actual = strings.str.minhash64(seeds=seeds, width=5) + assert_eq(expected, actual) + + # test wrong seed types with pytest.raises(ValueError): - strings.str.minhash(seeds=7) - with pytest.raises(ValueError): - strings.str.minhash(seeds=seeds, method="md5") + strings.str.minhash(seeds="a") with pytest.raises(ValueError): seeds = cudf.Series([0, 1, 2], dtype=np.int32) strings.str.minhash(seeds=seeds) + with pytest.raises(ValueError): + seeds = cudf.Series([0, 1, 2], dtype=np.uint32) + strings.str.minhash64(seeds=seeds) def test_jaccard_index(): From 595308b528dbcc6d409c28aa11d2f8c6fe1886ed Mon Sep 17 00:00:00 2001 From: "Robert (Bobby) Evans" Date: Tue, 22 Aug 2023 08:02:35 -0500 Subject: [PATCH 080/230] Add java API to get size of host memory needed to copy column view (#13919) To help 
with work for host memory management in java this provides an API to know how much memory is needed on the host to copy the data before it happens. This was written by @jbrennan333 but I am taking over the patch to get it in. Authors: - Robert (Bobby) Evans (https://github.com/revans2) Approvers: - Nghia Truong (https://github.com/ttnghia) - Gera Shegalov (https://github.com/gerashegalov) - Raza Jafri (https://github.com/razajafri) URL: https://github.com/rapidsai/cudf/pull/13919 --- .../main/java/ai/rapids/cudf/ColumnView.java | 17 ++++++++-- java/src/main/native/src/ColumnViewJni.cpp | 31 ++++++++++++----- .../java/ai/rapids/cudf/ColumnVectorTest.java | 33 ++++++++++++++----- 3 files changed, 62 insertions(+), 19 deletions(-) diff --git a/java/src/main/java/ai/rapids/cudf/ColumnView.java b/java/src/main/java/ai/rapids/cudf/ColumnView.java index 0a7346d1cbc..7db40278d4e 100644 --- a/java/src/main/java/ai/rapids/cudf/ColumnView.java +++ b/java/src/main/java/ai/rapids/cudf/ColumnView.java @@ -307,7 +307,7 @@ public final int getNumChildren() { * Returns the amount of device memory used. */ public long getDeviceMemorySize() { - return getDeviceMemorySize(getNativeView()); + return getDeviceMemorySize(getNativeView(), false); } @Override @@ -4789,7 +4789,7 @@ static native long makeCudfColumnView(int type, int scale, long data, long dataS static native int getNativeNumChildren(long viewHandle) throws CudfException; // calculate the amount of device memory used by this column including any child columns - static native long getDeviceMemorySize(long viewHandle) throws CudfException; + static native long getDeviceMemorySize(long viewHandle, boolean shouldPadForCpu) throws CudfException; static native long copyColumnViewToCV(long viewHandle) throws CudfException; @@ -5160,6 +5160,19 @@ public HostColumnVector copyToHost() { } } + /** + * Calculate the total space required to copy the data to the host. This should be padded to + * the alignment that the CPU requires. + */ + public long getHostBytesRequired() { + return getDeviceMemorySize(getNativeView(), true); + } + + /** + * Get the size that the host will align memory allocations to in bytes. 
+ */ + public static native long hostPaddingSizeInBytes(); + /** * Exact check if a column or its descendants have non-empty null rows * diff --git a/java/src/main/native/src/ColumnViewJni.cpp b/java/src/main/native/src/ColumnViewJni.cpp index 1cb51a22bf3..d5aad03645f 100644 --- a/java/src/main/native/src/ColumnViewJni.cpp +++ b/java/src/main/native/src/ColumnViewJni.cpp @@ -91,22 +91,32 @@ using cudf::jni::release_as_jlong; namespace { -std::size_t calc_device_memory_size(cudf::column_view const &view) { +std::size_t pad_size(std::size_t size, bool const should_pad_for_cpu) { + if (should_pad_for_cpu) { + constexpr std::size_t ALIGN = sizeof(std::max_align_t); + return (size + (ALIGN - 1)) & ~(ALIGN - 1); + } else { + return size; + } +} + +std::size_t calc_device_memory_size(cudf::column_view const &view, bool const pad_for_cpu) { std::size_t total = 0; auto row_count = view.size(); if (view.nullable()) { - total += cudf::bitmask_allocation_size_bytes(row_count); + total += pad_size(cudf::bitmask_allocation_size_bytes(row_count), pad_for_cpu); } auto dtype = view.type(); if (cudf::is_fixed_width(dtype)) { - total += cudf::size_of(dtype) * view.size(); + total += pad_size(cudf::size_of(dtype) * view.size(), pad_for_cpu); } - return std::accumulate( - view.child_begin(), view.child_end(), total, - [](std::size_t t, cudf::column_view const &v) { return t + calc_device_memory_size(v); }); + return std::accumulate(view.child_begin(), view.child_end(), total, + [pad_for_cpu](std::size_t t, cudf::column_view const &v) { + return t + calc_device_memory_size(v, pad_for_cpu); + }); } } // anonymous namespace @@ -2217,16 +2227,21 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_getNativeValidityLength(J } JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_getDeviceMemorySize(JNIEnv *env, jclass, - jlong handle) { + jlong handle, + jboolean pad_for_cpu) { JNI_NULL_CHECK(env, handle, "native handle is null", 0); try { cudf::jni::auto_set_device(env); auto view = reinterpret_cast(handle); - return calc_device_memory_size(*view); + return calc_device_memory_size(*view, pad_for_cpu); } CATCH_STD(env, 0); } +JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_hostPaddingSizeInBytes(JNIEnv *env, jclass) { + return sizeof(std::max_align_t); +} + JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_clamper(JNIEnv *env, jobject j_object, jlong handle, jlong j_lo_scalar, jlong j_lo_replace_scalar, diff --git a/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java b/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java index 0e1fbad6129..1062a765800 100644 --- a/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java +++ b/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java @@ -1026,21 +1026,36 @@ void decimal128Cv() { } } + static final long HOST_ALIGN_BYTES = ColumnView.hostPaddingSizeInBytes(); + + static void assertHostAligned(long expectedDeviceSize, ColumnView cv) { + long deviceSize = cv.getDeviceMemorySize(); + assertEquals(expectedDeviceSize, deviceSize); + long hostSize = cv.getHostBytesRequired(); + assert(hostSize >= deviceSize); + long roundedHostSize = (hostSize / HOST_ALIGN_BYTES) * HOST_ALIGN_BYTES; + assertEquals(hostSize, roundedHostSize, "The host size should be a multiple of " + + HOST_ALIGN_BYTES); + } + @Test void testGetDeviceMemorySizeNonStrings() { try (ColumnVector v0 = ColumnVector.fromBoxedInts(1, 2, 3, 4, 5, 6); ColumnVector v1 = ColumnVector.fromBoxedInts(1, 2, 3, null, null, 4, 5, 6)) { - assertEquals(24, v0.getDeviceMemorySize()); // (6*4B) - 
assertEquals(96, v1.getDeviceMemorySize()); // (8*4B) + 64B(for validity vector) + assertHostAligned(24, v0); // (6*4B) + assertHostAligned(96, v1); // (8*4B) + 64B(for validity vector) } } @Test void testGetDeviceMemorySizeStrings() { + if (ColumnView.hostPaddingSizeInBytes() != 8) { + System.err.println("HOST PADDING SIZE: " + ColumnView.hostPaddingSizeInBytes()); + } try (ColumnVector v0 = ColumnVector.fromStrings("onetwothree", "four", "five"); ColumnVector v1 = ColumnVector.fromStrings("onetwothree", "four", null, "five")) { - assertEquals(35, v0.getDeviceMemorySize()); //19B data + 4*4B offsets = 35 - assertEquals(103, v1.getDeviceMemorySize()); //19B data + 5*4B + 64B validity vector = 103B + assertHostAligned(35, v0); //19B data + 4*4B offsets = 35 + assertHostAligned(103, v1); //19B data + 5*4B + 64B validity vector = 103B } } @@ -1061,13 +1076,13 @@ void testGetDeviceMemorySizeLists() { // 64 bytes for validity of list column // 16 bytes for offsets of list column // 64 bytes for validity of string column - // 24 bytes for offsets of of string column + // 24 bytes for offsets of string column // 22 bytes of string character size - assertEquals(64+16+64+24+22, sv.getDeviceMemorySize()); + assertHostAligned(64+16+64+24+22, sv); // 20 bytes for offsets of list column // 28 bytes for data of INT32 column - assertEquals(20+28, iv.getDeviceMemorySize()); + assertHostAligned(20+28, iv); } } @@ -1091,11 +1106,11 @@ void testGetDeviceMemorySizeStructs() { // 64 bytes for validity of list column // 20 bytes for offsets of list column // 64 bytes for validity of string column - // 28 bytes for offsets of of string column + // 28 bytes for offsets of string column // 22 bytes of string character size // 64 bytes for validity of int64 column // 28 bytes for data of the int64 column - assertEquals(64+64+20+64+28+22+64+28, v.getDeviceMemorySize()); + assertHostAligned(64+64+20+64+28+22+64+28, v); } } From 0d90b8e33472035a1024169fbca598cfa96f4d9d Mon Sep 17 00:00:00 2001 From: Alessandro Bellina Date: Tue, 22 Aug 2023 10:38:24 -0500 Subject: [PATCH 081/230] Make HostColumnVector.getRefCount public (#13934) This is a small PR to make HostColumnVector.getRefCount public. 
This is useful in figuring out spillability for a `HostColumnVector` (plugin issue: https://github.com/NVIDIA/spark-rapids/issues/8882) Authors: - Alessandro Bellina (https://github.com/abellina) Approvers: - Jason Lowe (https://github.com/jlowe) - Robert (Bobby) Evans (https://github.com/revans2) URL: https://github.com/rapidsai/cudf/pull/13934 --- java/src/main/java/ai/rapids/cudf/HostColumnVector.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/java/src/main/java/ai/rapids/cudf/HostColumnVector.java b/java/src/main/java/ai/rapids/cudf/HostColumnVector.java index 7993989825d..3e4baf962bc 100644 --- a/java/src/main/java/ai/rapids/cudf/HostColumnVector.java +++ b/java/src/main/java/ai/rapids/cudf/HostColumnVector.java @@ -199,7 +199,7 @@ private synchronized HostColumnVector incRefCountInternal(boolean isFirstTime) { /** * Returns this column's current refcount */ - synchronized int getRefCount() { + public synchronized int getRefCount() { return refCount; } From 0e5f9dbac252cd4a59b7d33967b8df4acbf99571 Mon Sep 17 00:00:00 2001 From: GALI PREM SAGAR Date: Tue, 22 Aug 2023 11:44:56 -0500 Subject: [PATCH 082/230] Fix handling of typecasting in `searchsorted` (#13925) Fixes: #13902 This PR fixes a type-casting issue with `searchsorted` where typecast was done to the `values` dtype instead of inspecting both input and values columns and converting them to common dtypes. Authors: - GALI PREM SAGAR (https://github.com/galipremsagar) Approvers: - Lawrence Mitchell (https://github.com/wence-) URL: https://github.com/rapidsai/cudf/pull/13925 --- python/cudf/cudf/core/frame.py | 19 ++++++++++++++++--- python/cudf/cudf/tests/test_indexing.py | 13 +++++++++++++ python/cudf/cudf/tests/test_search.py | 14 +++++++++++++- 3 files changed, 42 insertions(+), 4 deletions(-) diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index 69757fe900d..b9f052e7626 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -1409,12 +1409,25 @@ def searchsorted( if len(values) != len(self._data): raise ValueError("Mismatch number of columns to search for.") + # TODO: Change behavior based on the decision in + # https://github.com/pandas-dev/pandas/issues/54668 + common_dtype_list = [ + find_common_type([col.dtype, val.dtype]) + for col, val in zip(self._columns, values) + ] sources = [ col - if is_dtype_equal(col.dtype, val.dtype) - else col.astype(val.dtype) - for col, val in zip(self._columns, values) + if is_dtype_equal(col.dtype, common_dtype) + else col.astype(common_dtype) + for col, common_dtype in zip(self._columns, common_dtype_list) ] + values = [ + val + if is_dtype_equal(val.dtype, common_dtype) + else val.astype(common_dtype) + for val, common_dtype in zip(values, common_dtype_list) + ] + outcol = libcudf.search.search_sorted( sources, values, diff --git a/python/cudf/cudf/tests/test_indexing.py b/python/cudf/cudf/tests/test_indexing.py index a2398c3b705..d747ed13e27 100644 --- a/python/cudf/cudf/tests/test_indexing.py +++ b/python/cudf/cudf/tests/test_indexing.py @@ -2067,3 +2067,16 @@ def test_loc_index_notinindex_slice( expect = pdf.loc[lo:hi:take_order] actual = df.loc[lo:hi:take_order] assert_eq(expect, actual) + + +@pytest.mark.parametrize( + "arg", [slice(2, 4), slice(2, 5), slice(2.3, 5), slice(4.6, 6)] +) +def test_series_iloc_float_int(arg): + gs = cudf.Series(range(4), index=[2.0, 3.0, 4.5, 5.5]) + ps = gs.to_pandas() + + actual = gs.loc[arg] + expected = ps.loc[arg] + + assert_eq(actual, expected) diff --git 
a/python/cudf/cudf/tests/test_search.py b/python/cudf/cudf/tests/test_search.py index d3433a589a7..b0eacb1a709 100644 --- a/python/cudf/cudf/tests/test_search.py +++ b/python/cudf/cudf/tests/test_search.py @@ -1,4 +1,4 @@ -# Copyright (c) 2018-2022, NVIDIA CORPORATION. +# Copyright (c) 2018-2023, NVIDIA CORPORATION. import cupy import numpy as np import pandas as pd @@ -156,3 +156,15 @@ def test_searchsorted_misc(): psr.searchsorted([-100, 3.00001, 2.2, 2.0, 2.000000001]), sr.searchsorted([-100, 3.00001, 2.2, 2.0, 2.000000001]), ) + + +@pytest.mark.xfail(reason="https://github.com/pandas-dev/pandas/issues/54668") +def test_searchsorted_mixed_str_int(): + psr = pd.Series([1, 2, 3], dtype="int") + sr = cudf.from_pandas(psr) + + with pytest.raises(ValueError): + actual = sr.searchsorted("a") + with pytest.raises(ValueError): + expect = psr.searchsorted("a") + assert_eq(expect, actual) From 0fe00d2bdaf962e732314fad85d074a9923dc348 Mon Sep 17 00:00:00 2001 From: GALI PREM SAGAR Date: Tue, 22 Aug 2023 16:14:55 -0500 Subject: [PATCH 083/230] Unpin `dask` and `distributed` for `23.10` development (#13935) This PR unpins `dask` and `distributed` to use nightly builds for `23.10` development. Authors: - GALI PREM SAGAR (https://github.com/galipremsagar) Approvers: - Ray Douglass (https://github.com/raydouglass) - https://github.com/jakirkham - Peter Andreas Entschev (https://github.com/pentschev) URL: https://github.com/rapidsai/cudf/pull/13935 --- ci/test_wheel_dask_cudf.sh | 2 +- conda/environments/all_cuda-118_arch-x86_64.yaml | 6 +++--- conda/environments/all_cuda-120_arch-x86_64.yaml | 6 +++--- conda/recipes/custreamz/meta.yaml | 6 +++--- conda/recipes/dask-cudf/meta.yaml | 12 ++++++------ conda/recipes/dask-cudf/run_test.sh | 2 +- dependencies.yaml | 6 +++--- python/dask_cudf/pyproject.toml | 4 ++-- 8 files changed, 22 insertions(+), 22 deletions(-) diff --git a/ci/test_wheel_dask_cudf.sh b/ci/test_wheel_dask_cudf.sh index 38fce04459e..d6e7f4bf65e 100755 --- a/ci/test_wheel_dask_cudf.sh +++ b/ci/test_wheel_dask_cudf.sh @@ -11,7 +11,7 @@ RAPIDS_PY_WHEEL_NAME="cudf_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-from python -m pip install --no-deps ./local-cudf-dep/cudf*.whl # Always install latest dask for testing -python -m pip install git+https://github.com/dask/dask.git@2023.7.1 git+https://github.com/dask/distributed.git@2023.7.1 git+https://github.com/rapidsai/dask-cuda.git@branch-23.10 +python -m pip install git+https://github.com/dask/dask.git@main git+https://github.com/dask/distributed.git@main git+https://github.com/rapidsai/dask-cuda.git@branch-23.10 # echo to expand wildcard before adding `[extra]` requires for pip python -m pip install $(echo ./dist/dask_cudf*.whl)[test] diff --git a/conda/environments/all_cuda-118_arch-x86_64.yaml b/conda/environments/all_cuda-118_arch-x86_64.yaml index e9f97a63db7..e4a9b2f1d29 100644 --- a/conda/environments/all_cuda-118_arch-x86_64.yaml +++ b/conda/environments/all_cuda-118_arch-x86_64.yaml @@ -24,10 +24,10 @@ dependencies: - cupy>=12.0.0 - cxx-compiler - cython>=3.0.0 -- dask-core==2023.7.1 +- dask-core>=2023.7.1 - dask-cuda==23.10.* -- dask==2023.7.1 -- distributed==2023.7.1 +- dask>=2023.7.1 +- distributed>=2023.7.1 - dlpack>=0.5,<0.6.0a0 - doxygen=1.8.20 - fastavro>=0.22.9 diff --git a/conda/environments/all_cuda-120_arch-x86_64.yaml b/conda/environments/all_cuda-120_arch-x86_64.yaml index dd85db528a6..d03c4364435 100644 --- a/conda/environments/all_cuda-120_arch-x86_64.yaml +++ b/conda/environments/all_cuda-120_arch-x86_64.yaml @@ -25,10 
+25,10 @@ dependencies: - cupy>=12.0.0 - cxx-compiler - cython>=3.0.0 -- dask-core==2023.7.1 +- dask-core>=2023.7.1 - dask-cuda==23.10.* -- dask==2023.7.1 -- distributed==2023.7.1 +- dask>=2023.7.1 +- distributed>=2023.7.1 - dlpack>=0.5,<0.6.0a0 - doxygen=1.8.20 - fastavro>=0.22.9 diff --git a/conda/recipes/custreamz/meta.yaml b/conda/recipes/custreamz/meta.yaml index 499a4e99fc0..7aaa40bffd0 100644 --- a/conda/recipes/custreamz/meta.yaml +++ b/conda/recipes/custreamz/meta.yaml @@ -45,9 +45,9 @@ requirements: - streamz - cudf ={{ version }} - cudf_kafka ={{ version }} - - dask ==2023.7.1 - - dask-core ==2023.7.1 - - distributed ==2023.7.1 + - dask >=2023.7.1 + - dask-core >=2023.7.1 + - distributed >=2023.7.1 - python-confluent-kafka >=1.9.0,<1.10.0a0 - {{ pin_compatible('cuda-version', max_pin='x', min_pin='x') }} diff --git a/conda/recipes/dask-cudf/meta.yaml b/conda/recipes/dask-cudf/meta.yaml index a2aead271b2..12809ba648f 100644 --- a/conda/recipes/dask-cudf/meta.yaml +++ b/conda/recipes/dask-cudf/meta.yaml @@ -38,16 +38,16 @@ requirements: host: - python - cudf ={{ version }} - - dask ==2023.7.1 - - dask-core ==2023.7.1 - - distributed ==2023.7.1 + - dask >=2023.7.1 + - dask-core >=2023.7.1 + - distributed >=2023.7.1 - cuda-version ={{ cuda_version }} run: - python - cudf ={{ version }} - - dask ==2023.7.1 - - dask-core ==2023.7.1 - - distributed ==2023.7.1 + - dask >=2023.7.1 + - dask-core >=2023.7.1 + - distributed >=2023.7.1 - {{ pin_compatible('cuda-version', max_pin='x', min_pin='x') }} test: diff --git a/conda/recipes/dask-cudf/run_test.sh b/conda/recipes/dask-cudf/run_test.sh index bb00f9cf8bc..7dc54747a0c 100644 --- a/conda/recipes/dask-cudf/run_test.sh +++ b/conda/recipes/dask-cudf/run_test.sh @@ -18,7 +18,7 @@ if [ "${ARCH}" = "aarch64" ]; then fi # Dask & Distributed option to install main(nightly) or `conda-forge` packages. -export INSTALL_DASK_MAIN=0 +export INSTALL_DASK_MAIN=1 # Dask version to install when `INSTALL_DASK_MAIN=0` export DASK_STABLE_VERSION="2023.7.1" diff --git a/dependencies.yaml b/dependencies.yaml index 79db1f16947..a1d928797b0 100644 --- a/dependencies.yaml +++ b/dependencies.yaml @@ -483,12 +483,12 @@ dependencies: common: - output_types: [conda, requirements, pyproject] packages: - - dask==2023.7.1 - - distributed==2023.7.1 + - dask>=2023.7.1 + - distributed>=2023.7.1 - output_types: conda packages: - cupy>=12.0.0 - - dask-core==2023.7.1 # dask-core in conda is the actual package & dask is the meta package + - dask-core>=2023.7.1 # dask-core in conda is the actual package & dask is the meta package - output_types: pyproject packages: - &cudf cudf==23.10.* diff --git a/python/dask_cudf/pyproject.toml b/python/dask_cudf/pyproject.toml index 7c4b56b3245..2464abca71a 100644 --- a/python/dask_cudf/pyproject.toml +++ b/python/dask_cudf/pyproject.toml @@ -20,8 +20,8 @@ requires-python = ">=3.9" dependencies = [ "cudf==23.10.*", "cupy-cuda11x>=12.0.0", - "dask==2023.7.1", - "distributed==2023.7.1", + "dask>=2023.7.1", + "distributed>=2023.7.1", "fsspec>=0.6.0", "numpy>=1.21", "pandas>=1.3,<1.6.0dev0", From 62148b42718e6ddd466e4f31a03d8c98a48ed191 Mon Sep 17 00:00:00 2001 From: GALI PREM SAGAR Date: Tue, 22 Aug 2023 17:15:10 -0500 Subject: [PATCH 084/230] Fix an issue with `loc` when column names is `MultiIndex` (#13929) Fixes: #13864 This PR fixes an issue with `loc` indexer where some special handling needs to be done when `columns` is of type `MultiIndex`. 
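A small sketch of the kind of selection this fixes, adapted from the test cases added in this PR (the frame contents are illustrative):

```python
import cudf

gdf = cudf.DataFrame(
    [list("abcd"), list("efgh"), list("ijkl"), list("mnop")],
    columns=cudf.MultiIndex.from_product(
        [["one", "two"], ["first", "second"], ["three"]]
    ),
)

# partial tuple keys against MultiIndex columns now agree with pandas
gdf.loc[2, ("one", "second")]
gdf.loc[:, ("two", "first")]
```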
Authors: - GALI PREM SAGAR (https://github.com/galipremsagar) Approvers: - Lawrence Mitchell (https://github.com/wence-) URL: https://github.com/rapidsai/cudf/pull/13929 --- python/cudf/cudf/core/column_accessor.py | 2 +- python/cudf/cudf/core/dataframe.py | 24 ++++++++++++++++------- python/cudf/cudf/tests/test_indexing.py | 25 ++++++++++++++++++++++++ 3 files changed, 43 insertions(+), 8 deletions(-) diff --git a/python/cudf/cudf/core/column_accessor.py b/python/cudf/cudf/core/column_accessor.py index ec854cb977d..bec9c367ba9 100644 --- a/python/cudf/cudf/core/column_accessor.py +++ b/python/cudf/cudf/core/column_accessor.py @@ -511,7 +511,7 @@ def _select_by_label_list_like(self, key: Any) -> ColumnAccessor: def _select_by_label_grouped(self, key: Any) -> ColumnAccessor: result = self._grouped_data[key] if isinstance(result, cudf.core.column.ColumnBase): - return self.__class__({key: result}) + return self.__class__({key: result}, multiindex=self.multiindex) else: if self.multiindex: result = _to_flat_dict(result) diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index fc624c0b8eb..c80c2a7272e 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -173,13 +173,13 @@ def _can_downcast_to_series(self, df, arg): all_numeric = all(is_numeric_dtype(t) for t in dtypes) if all_numeric: return True + if isinstance(arg[1], tuple): + return True if ncols == 1: if type(arg[1]) is slice: return False if isinstance(arg[1], tuple): - # Multiindex indexing with a slice - if any(isinstance(v, slice) for v in arg): - return False + return len(arg[1]) == df._data.nlevels if not (is_list_like(arg[1]) or is_column_like(arg[1])): return True return False @@ -193,7 +193,10 @@ def _downcast_to_series(self, df, arg): nrows, ncols = df.shape # determine the axis along which the Series is taken: if nrows == 1 and ncols == 1: - if is_scalar(arg[0]) and is_scalar(arg[1]): + if is_scalar(arg[0]) and ( + is_scalar(arg[1]) + or (df._data.multiindex and arg[1] in df._column_names) + ): return df[df._column_names[0]].iloc[0] elif not is_scalar(arg[0]): axis = 1 @@ -288,13 +291,20 @@ def _getitem_tuple_arg(self, arg): ) else: tmp_col_name = str(uuid4()) + cantor_name = "_" + "_".join( + map(str, columns_df._data.names) + ) + if columns_df._data.multiindex: + # column names must be appropriate length tuples + extra = tuple( + "" for _ in range(columns_df._data.nlevels - 1) + ) + tmp_col_name = (tmp_col_name, *extra) + cantor_name = (cantor_name, *extra) other_df = DataFrame( {tmp_col_name: column.arange(len(tmp_arg[0]))}, index=as_index(tmp_arg[0]), ) - cantor_name = "_" + "_".join( - map(str, columns_df._data.names) - ) columns_df[cantor_name] = column.arange(len(columns_df)) df = other_df.join(columns_df, how="inner") # as join is not assigning any names to index, diff --git a/python/cudf/cudf/tests/test_indexing.py b/python/cudf/cudf/tests/test_indexing.py index d747ed13e27..2e169a2b0b1 100644 --- a/python/cudf/cudf/tests/test_indexing.py +++ b/python/cudf/cudf/tests/test_indexing.py @@ -2069,6 +2069,31 @@ def test_loc_index_notinindex_slice( assert_eq(expect, actual) +@pytest.mark.parametrize( + "arg", + [ + (2, ("one", "second")), + (slice(None, None, None), ("two", "first")), + (1, ("one", "first")), + (slice(None, None, None), ("two", "second")), + (slice(None, None, None), ("two", "first", "three")), + (3, ("two", "first", "three")), + (slice(None, None, None), ("two",)), + (0, ("two",)), + ], +) +def 
test_loc_dataframe_column_multiindex(arg): + gdf = cudf.DataFrame( + [list("abcd"), list("efgh"), list("ijkl"), list("mnop")], + columns=cudf.MultiIndex.from_product( + [["one", "two"], ["first", "second"], ["three"]] + ), + ) + pdf = gdf.to_pandas() + + assert_eq(gdf.loc[arg], pdf.loc[arg]) + + @pytest.mark.parametrize( "arg", [slice(2, 4), slice(2, 5), slice(2.3, 5), slice(4.6, 6)] ) From 88a8efcb5202a14c34088af4a4a49ba0e5a0318f Mon Sep 17 00:00:00 2001 From: David Wendt <45795991+davidwendt@users.noreply.github.com> Date: Wed, 23 Aug 2023 11:29:37 -0400 Subject: [PATCH 085/230] Fix memory access error in cudf::shift for sliced strings (#13894) Fixes `cudf::strings::detail::shift` logic with sliced input strings column when copying the chars data to the output column. Added additional tests including a null fill scalar. Closes #13852 Authors: - David Wendt (https://github.com/davidwendt) Approvers: - Bradley Dice (https://github.com/bdice) - Yunsong Wang (https://github.com/PointKernel) URL: https://github.com/rapidsai/cudf/pull/13894 --- cpp/src/strings/copying/shift.cu | 84 ++++++++++++++----------------- cpp/tests/copying/shift_tests.cpp | 38 +++++++++++--- 2 files changed, 69 insertions(+), 53 deletions(-) diff --git a/cpp/src/strings/copying/shift.cu b/cpp/src/strings/copying/shift.cu index 5f8fc483a34..b54c433c23d 100644 --- a/cpp/src/strings/copying/shift.cu +++ b/cpp/src/strings/copying/shift.cu @@ -18,6 +18,8 @@ #include #include #include +#include +#include #include #include @@ -31,35 +33,31 @@ namespace cudf::strings::detail { namespace { -struct adjust_offsets_fn { - column_device_view const d_column; +struct output_sizes_fn { + column_device_view const d_column; // input strings column string_view const d_filler; size_type const offset; + __device__ size_type get_string_size_at(size_type idx) + { + return d_column.is_null(idx) ? 0 : d_column.element(idx).size_bytes(); + } + __device__ size_type operator()(size_type idx) { + auto const last_index = offset < 0 ? d_column.size() + offset : offset; if (offset < 0) { - auto const first = d_column.element(-offset); - auto const last_index = d_column.size() + offset; - if (idx < last_index) { - return d_column.element(idx - offset) - first; - } else { - auto const last = d_column.element(d_column.size() - 1); - return (last - first) + ((idx - last_index + 1) * d_filler.size_bytes()); - } + // shift left: a,b,c,d,e,f -> b,c,d,e,f,x + return (idx < last_index) ? get_string_size_at(idx - offset) : d_filler.size_bytes(); } else { - if (idx < offset) { - return idx * d_filler.size_bytes(); - } else { - auto const total_filler = d_filler.size_bytes() * offset; - return total_filler + d_column.element(idx - offset); - } + // shift right: a,b,c,d,e,f -> x,a,b,c,d,e + return (idx < last_index) ? 
d_filler.size_bytes() : get_string_size_at(idx - offset); } } }; struct shift_chars_fn { - column_device_view const d_column; + column_device_view const d_column; // input strings column string_view const d_filler; size_type const offset; @@ -68,8 +66,11 @@ struct shift_chars_fn { if (offset < 0) { auto const last_index = -offset; if (idx < last_index) { - auto const first_index = d_column.size() + offset; - return d_column.element(idx + first_index); + auto const first_index = + offset + d_column.child(strings_column_view::offsets_column_index) + .element(d_column.offset() + d_column.size()); + return d_column.child(strings_column_view::chars_column_index) + .element(idx + first_index); } else { auto const char_index = idx - last_index; return d_filler.data()[char_index % d_filler.size_bytes()]; @@ -78,7 +79,10 @@ struct shift_chars_fn { if (idx < offset) { return d_filler.data()[idx % d_filler.size_bytes()]; } else { - return d_column.element(idx - offset); + return d_column.child(strings_column_view::chars_column_index) + .element(idx - offset + + d_column.child(strings_column_view::offsets_column_index) + .element(d_column.offset())); } } } @@ -97,44 +101,30 @@ std::unique_ptr shift(strings_column_view const& input, // adjust offset when greater than the size of the input if (std::abs(offset) > input.size()) { offset = input.size(); } - // output offsets column is the same size as the input - auto const input_offsets = - cudf::detail::slice( - input.offsets(), {input.offset(), input.offset() + input.size() + 1}, stream) - .front(); - auto const offsets_size = input_offsets.size(); - auto offsets_column = cudf::detail::allocate_like( - input_offsets, offsets_size, mask_allocation_policy::NEVER, stream, mr); - - // run kernel to simultaneously shift and adjust the values in the output offsets column - auto d_offsets = mutable_column_device_view::create(offsets_column->mutable_view(), stream); - auto const d_input_offsets = column_device_view::create(input_offsets, stream); - thrust::transform(rmm::exec_policy(stream), - thrust::counting_iterator(0), - thrust::counting_iterator(offsets_size), - d_offsets->data(), - adjust_offsets_fn{*d_input_offsets, d_fill_str, offset}); + // build the output offsets by computing the sizes of each output row + auto const d_input = column_device_view::create(input.parent(), stream); + auto sizes_itr = cudf::detail::make_counting_transform_iterator( + 0, output_sizes_fn{*d_input, d_fill_str, offset}); + auto [offsets_column, total_bytes] = + cudf::detail::make_offsets_child_column(sizes_itr, sizes_itr + input.size(), stream, mr); + auto offsets_view = offsets_column->view(); // compute the shift-offset for the output characters child column auto const shift_offset = [&] { - auto const index = (offset >= 0) ? offset : offsets_size - 1 + offset; - return (offset < 0 ? -1 : 1) * - cudf::detail::get_value(offsets_column->view(), index, stream); + auto const index = (offset < 0) ? input.size() + offset : offset; + return (offset < 0 ? 
-1 : 1) * cudf::detail::get_value(offsets_view, index, stream); }(); // create output chars child column - auto const chars_size = - cudf::detail::get_value(offsets_column->view(), offsets_size - 1, stream); - auto chars_column = create_chars_child_column(chars_size, stream, mr); + auto chars_column = create_chars_child_column(static_cast(total_bytes), stream, mr); auto d_chars = mutable_column_device_view::create(chars_column->mutable_view(), stream); - auto const d_input_chars = column_device_view::create(input.chars(), stream); - // run kernel to shift the characters + // run kernel to shift all the characters thrust::transform(rmm::exec_policy(stream), thrust::counting_iterator(0), - thrust::counting_iterator(chars_size), + thrust::counting_iterator(total_bytes), d_chars->data(), - shift_chars_fn{*d_input_chars, d_fill_str, shift_offset}); + shift_chars_fn{*d_input, d_fill_str, shift_offset}); // caller sets the null-mask return make_strings_column( diff --git a/cpp/tests/copying/shift_tests.cpp b/cpp/tests/copying/shift_tests.cpp index 9ca07040175..17e56ea8ed8 100644 --- a/cpp/tests/copying/shift_tests.cpp +++ b/cpp/tests/copying/shift_tests.cpp @@ -206,22 +206,48 @@ TEST_F(ShiftTests, StringsShiftTest) auto results = cudf::shift(input, 2, fill); auto expected_right = cudf::test::strings_column_wrapper({"xx", "xx", "", "bb", "ccc"}, {1, 1, 0, 1, 1}); - CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected_right, *results); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected_right, *results); results = cudf::shift(input, -2, fill); auto expected_left = cudf::test::strings_column_wrapper({"ccc", "ddddddé", "", "xx", "xx"}, {1, 1, 0, 1, 1}); - CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected_left, *results); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected_left, *results); auto sliced = cudf::slice(input, {1, 4}).front(); results = cudf::shift(sliced, 1, fill); - auto sliced_right = cudf::test::strings_column_wrapper({"xx", "bb", "ccc"}); - CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(sliced_right, *results); + auto sliced_right = cudf::test::strings_column_wrapper({"xx", "bb", "ccc"}, {1, 1, 1}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(sliced_right, *results); results = cudf::shift(sliced, -1, fill); - auto sliced_left = cudf::test::strings_column_wrapper({"ccc", "ddddddé", "xx"}); - CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(sliced_left, *results); + auto sliced_left = cudf::test::strings_column_wrapper({"ccc", "ddddddé", "xx"}, {1, 1, 1}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(sliced_left, *results); +} + +TEST_F(ShiftTests, StringsShiftNullFillTest) +{ + auto input = cudf::test::strings_column_wrapper( + {"a", "b", "c", "d", "e", "ff", "ggg", "hhhh", "iii", "jjjjj"}); + auto phil = cudf::string_scalar("", false); + + auto results = cudf::shift(input, -1, phil); + auto expected = cudf::test::strings_column_wrapper( + {"b", "c", "d", "e", "ff", "ggg", "hhhh", "iii", "jjjjj", ""}, {1, 1, 1, 1, 1, 1, 1, 1, 1, 0}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); + + results = cudf::shift(input, 1, phil); + expected = cudf::test::strings_column_wrapper( + {"", "a", "b", "c", "d", "e", "ff", "ggg", "hhhh", "iii"}, {0, 1, 1, 1, 1, 1, 1, 1, 1, 1}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); + + auto sliced = cudf::slice(input, {5, 10}).front(); + results = cudf::shift(sliced, -2, phil); + expected = cudf::test::strings_column_wrapper({"hhhh", "iii", "jjjjj", "", ""}, {1, 1, 1, 0, 0}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); + + results = cudf::shift(sliced, 2, phil); + expected = cudf::test::strings_column_wrapper({"", 
"", "ff", "ggg", "hhhh"}, {0, 0, 1, 1, 1}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); } TEST_F(ShiftTests, OffsetGreaterThanSize) From e16ed8102a7f2536d03c014b7cfb7078b1a8df8c Mon Sep 17 00:00:00 2001 From: Martin Marenz Date: Wed, 23 Aug 2023 18:16:33 +0200 Subject: [PATCH 086/230] Fixed processed bytes calculation in APPLY_BOOLEAN_MASK benchmark. (#13937) Due to missing parentheses in APPLY_BOOLEAN_MASK benchmark, the number of written bytes were not multiplied by the number of iterations of this benchmark. This patch relates to #13735. Authors: - Martin Marenz (https://github.com/Blonck) Approvers: - Nghia Truong (https://github.com/ttnghia) - David Wendt (https://github.com/davidwendt) URL: https://github.com/rapidsai/cudf/pull/13937 --- cpp/benchmarks/stream_compaction/apply_boolean_mask.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/benchmarks/stream_compaction/apply_boolean_mask.cpp b/cpp/benchmarks/stream_compaction/apply_boolean_mask.cpp index f352d031240..a6feaf04842 100644 --- a/cpp/benchmarks/stream_compaction/apply_boolean_mask.cpp +++ b/cpp/benchmarks/stream_compaction/apply_boolean_mask.cpp @@ -65,7 +65,7 @@ void calculate_bandwidth(benchmark::State& state, cudf::size_type num_columns) (column_bytes_out + validity_bytes_out) * num_columns; // writing columns state.SetItemsProcessed(state.iterations() * column_size * num_columns); - state.SetBytesProcessed(static_cast(state.iterations()) * bytes_read + bytes_written); + state.SetBytesProcessed(static_cast(state.iterations()) * (bytes_read + bytes_written)); } } // namespace From 2700111e6b300cfff41b4e9137093bd22a00d1d4 Mon Sep 17 00:00:00 2001 From: Vukasin Milovanovic Date: Wed, 23 Aug 2023 10:52:35 -0700 Subject: [PATCH 087/230] Use `cudf::thread_index_type` in cuIO to prevent overflow in row indexing (#13910) Use wider type for indexing when rows are indexed in pattern ``` for (auto row = start_row; row < num_rows; row += block_size) { if (is_within_bounds) ... } ``` or ``` auto t = threadIdx.x; auto row = block_start + t; if (is_within_bounds) ... ``` Overflow can happen when the number of rows is so close to `max` that adding block size pushes the index over the max. Also sprinkled auto where increased size is not needed. Also removed a few redundant conditions. 
Authors: - Vukasin Milovanovic (https://github.com/vuule) Approvers: - Mark Harris (https://github.com/harrism) - Yunsong Wang (https://github.com/PointKernel) URL: https://github.com/rapidsai/cudf/pull/13910 --- cpp/include/cudf/detail/utilities/cuda.cuh | 27 ++++++++++++ cpp/src/io/csv/csv_gpu.cu | 13 +++--- cpp/src/io/json/legacy/json_gpu.cu | 8 ++-- cpp/src/io/orc/dict_enc.cu | 8 ++-- cpp/src/io/orc/stats_enc.cu | 16 +++---- cpp/src/io/orc/stripe_data.cu | 4 +- cpp/src/io/orc/writer_impl.cu | 2 +- cpp/src/io/parquet/chunk_dict.cu | 14 +++---- cpp/src/io/parquet/page_decode.cuh | 9 ++-- cpp/src/io/parquet/page_enc.cu | 14 +++---- cpp/src/io/parquet/page_string_decode.cu | 22 +++++----- cpp/src/io/text/multibyte_split.cu | 49 ++++++++++------------ cpp/src/io/utilities/parsing_utils.cu | 5 ++- 13 files changed, 108 insertions(+), 83 deletions(-) diff --git a/cpp/include/cudf/detail/utilities/cuda.cuh b/cpp/include/cudf/detail/utilities/cuda.cuh index 73c8969e207..c95189f1f94 100644 --- a/cpp/include/cudf/detail/utilities/cuda.cuh +++ b/cpp/include/cudf/detail/utilities/cuda.cuh @@ -65,6 +65,33 @@ class grid_1d { CUDF_EXPECTS(num_threads_per_block > 0, "num_threads_per_block must be > 0"); CUDF_EXPECTS(num_blocks > 0, "num_blocks must be > 0"); } + + /** + * @brief Returns the global thread index in a 1D grid. + * + * The returned index is unique across the entire grid. + * + * @param thread_id The thread index within the block + * @param block_id The block index within the grid + * @param num_threads_per_block The number of threads per block + * @return thread_index_type The global thread index + */ + static constexpr thread_index_type global_thread_id(thread_index_type thread_id, + thread_index_type block_id, + thread_index_type num_threads_per_block) + { + return thread_id + block_id * num_threads_per_block; + } + + /** + * @brief Returns the global thread index of the current thread in a 1D grid. + * + * @return thread_index_type The global thread index + */ + static __device__ thread_index_type global_thread_id() + { + return global_thread_id(threadIdx.x, blockIdx.x, blockDim.x); + } }; /** diff --git a/cpp/src/io/csv/csv_gpu.cu b/cpp/src/io/csv/csv_gpu.cu index ad8858c75d6..248e17669bc 100644 --- a/cpp/src/io/csv/csv_gpu.cu +++ b/cpp/src/io/csv/csv_gpu.cu @@ -20,6 +20,7 @@ #include #include +#include #include #include #include @@ -45,6 +46,7 @@ using namespace ::cudf::io; using cudf::device_span; +using cudf::detail::grid_1d; namespace cudf { namespace io { @@ -177,11 +179,10 @@ __global__ void __launch_bounds__(csvparse_block_dim) // ThreadIds range per block, so also need the blockId // This is entry into the fields; threadId is an element within `num_records` - long const rec_id = threadIdx.x + (blockDim.x * blockIdx.x); - long const rec_id_next = rec_id + 1; + auto const rec_id = grid_1d::global_thread_id(); + auto const rec_id_next = rec_id + 1; - // we can have more threads than data, make sure we are not past the end of - // the data + // we can have more threads than data, make sure we are not past the end of the data if (rec_id_next >= row_offsets.size()) { return; } auto field_start = raw_csv + row_offsets[rec_id]; @@ -317,8 +318,8 @@ __global__ void __launch_bounds__(csvparse_block_dim) auto const raw_csv = data.data(); // thread IDs range per block, so also need the block id. 
// this is entry into the field array - tid is an elements within the num_entries array - long const rec_id = threadIdx.x + (blockDim.x * blockIdx.x); - long const rec_id_next = rec_id + 1; + auto const rec_id = grid_1d::global_thread_id(); + auto const rec_id_next = rec_id + 1; // we can have more threads than data, make sure we are not past the end of the data if (rec_id_next >= row_offsets.size()) return; diff --git a/cpp/src/io/json/legacy/json_gpu.cu b/cpp/src/io/json/legacy/json_gpu.cu index d28d5614591..b358cc2071b 100644 --- a/cpp/src/io/json/legacy/json_gpu.cu +++ b/cpp/src/io/json/legacy/json_gpu.cu @@ -19,6 +19,7 @@ #include #include +#include #include #include #include @@ -44,6 +45,7 @@ #include using cudf::device_span; +using cudf::detail::grid_1d; namespace cudf::io::json::detail::legacy { @@ -252,7 +254,7 @@ __global__ void convert_data_to_columns_kernel(parse_options_view opts, device_span const valid_fields, device_span const num_valid_fields) { - auto const rec_id = threadIdx.x + (blockDim.x * blockIdx.x); + auto const rec_id = grid_1d::global_thread_id(); if (rec_id >= row_offsets.size()) return; auto const row_data_range = get_row_data_range(data, row_offsets, rec_id); @@ -327,7 +329,7 @@ __global__ void detect_data_types_kernel( int num_columns, device_span const column_infos) { - auto const rec_id = threadIdx.x + (blockDim.x * blockIdx.x); + auto const rec_id = grid_1d::global_thread_id(); if (rec_id >= row_offsets.size()) return; auto const are_rows_objects = col_map.capacity() != 0; @@ -485,7 +487,7 @@ __global__ void collect_keys_info_kernel(parse_options_view const options, unsigned long long int* keys_cnt, thrust::optional keys_info) { - auto const rec_id = threadIdx.x + (blockDim.x * blockIdx.x); + auto const rec_id = grid_1d::global_thread_id(); if (rec_id >= row_offsets.size()) return; auto const row_data_range = get_row_data_range(data, row_offsets, rec_id); diff --git a/cpp/src/io/orc/dict_enc.cu b/cpp/src/io/orc/dict_enc.cu index c069cb67cec..0007530a5af 100644 --- a/cpp/src/io/orc/dict_enc.cu +++ b/cpp/src/io/orc/dict_enc.cu @@ -130,7 +130,7 @@ __global__ void __launch_bounds__(block_size) size_type entry_count{0}; size_type char_count{0}; // all threads should loop the same number of times - for (auto cur_row = start_row + t; cur_row - t < end_row; cur_row += block_size) { + for (thread_index_type cur_row = start_row + t; cur_row - t < end_row; cur_row += block_size) { auto const is_valid = cur_row < end_row and col.is_valid(cur_row); if (is_valid) { @@ -215,11 +215,9 @@ __global__ void __launch_bounds__(block_size) cuco::empty_key{KEY_SENTINEL}, cuco::empty_value{VALUE_SENTINEL}); - auto cur_row = start_row + t; + thread_index_type cur_row = start_row + t; while (cur_row < end_row) { - auto const is_valid = cur_row < col.size() and col.is_valid(cur_row); - - if (is_valid) { + if (col.is_valid(cur_row)) { auto const hash_fn = hash_functor{col}; auto const equality_fn = equality_functor{col}; auto const found_slot = map.find(cur_row, hash_fn, equality_fn); diff --git a/cpp/src/io/orc/stats_enc.cu b/cpp/src/io/orc/stats_enc.cu index 8fada7d5d72..069841980c1 100644 --- a/cpp/src/io/orc/stats_enc.cu +++ b/cpp/src/io/orc/stats_enc.cu @@ -36,9 +36,9 @@ __global__ void __launch_bounds__(init_threads_per_block) device_2dspan rowgroup_bounds) { __shared__ __align__(4) statistics_group group_g[init_groups_per_block]; - uint32_t const col_id = blockIdx.y; - uint32_t const chunk_id = (blockIdx.x * init_groups_per_block) + threadIdx.y; - uint32_t const t = 
threadIdx.x; + auto const col_id = blockIdx.y; + auto const chunk_id = (blockIdx.x * init_groups_per_block) + threadIdx.y; + auto const t = threadIdx.x; auto const num_rowgroups = rowgroup_bounds.size().first; statistics_group* group = &group_g[threadIdx.y]; if (chunk_id < num_rowgroups and t == 0) { @@ -75,11 +75,11 @@ __global__ void __launch_bounds__(block_size, 1) using block_scan = cub::BlockScan; __shared__ typename block_scan::TempStorage temp_storage; volatile uint32_t stats_size = 0; - uint32_t t = threadIdx.x; + auto t = threadIdx.x; __syncthreads(); - for (uint32_t start = 0; start < statistics_count; start += block_size) { + for (thread_index_type start = 0; start < statistics_count; start += block_size) { uint32_t stats_len = 0, stats_pos; - uint32_t idx = start + t; + auto idx = start + t; if (idx < statistics_count) { statistics_dtype const dtype = groups[idx].stats_dtype; switch (dtype) { @@ -222,8 +222,8 @@ __global__ void __launch_bounds__(encode_threads_per_block) uint32_t statistics_count) { __shared__ __align__(8) stats_state_s state_g[encode_chunks_per_block]; - uint32_t t = threadIdx.x; - uint32_t idx = blockIdx.x * encode_chunks_per_block + threadIdx.y; + auto t = threadIdx.x; + auto idx = blockIdx.x * encode_chunks_per_block + threadIdx.y; stats_state_s* const s = &state_g[threadIdx.y]; // Encode and update actual bfr size diff --git a/cpp/src/io/orc/stripe_data.cu b/cpp/src/io/orc/stripe_data.cu index 592233c2418..b66ca827119 100644 --- a/cpp/src/io/orc/stripe_data.cu +++ b/cpp/src/io/orc/stripe_data.cu @@ -1206,7 +1206,7 @@ __global__ void __launch_bounds__(block_size) if (row_in < first_row && t < 32) { uint32_t skippedrows = min(static_cast(first_row - row_in), nrows); uint32_t skip_count = 0; - for (uint32_t i = t * 32; i < skippedrows; i += 32 * 32) { + for (thread_index_type i = t * 32; i < skippedrows; i += 32 * 32) { // Need to arrange the bytes to apply mask properly. uint32_t bits = (i + 32 <= skippedrows) ? 
s->vals.u32[i >> 5] : (__byte_perm(s->vals.u32[i >> 5], 0, 0x0123) & @@ -1435,7 +1435,7 @@ __global__ void __launch_bounds__(block_size) s->top.data.end_row = s->chunk.start_row + s->chunk.num_rows; s->top.data.buffered_count = 0; if (s->top.data.end_row > first_row + max_num_rows) { - s->top.data.end_row = static_cast(first_row + max_num_rows); + s->top.data.end_row = first_row + max_num_rows; } if (num_rowgroups > 0) { s->top.data.end_row = diff --git a/cpp/src/io/orc/writer_impl.cu b/cpp/src/io/orc/writer_impl.cu index 6a3c5f0134d..3d8bdb4ec97 100644 --- a/cpp/src/io/orc/writer_impl.cu +++ b/cpp/src/io/orc/writer_impl.cu @@ -372,7 +372,7 @@ __global__ void copy_string_data(char* string_pool, auto dst = &string_pool[offsets[blockIdx.x]]; auto src = str_val.ptr; - for (int i = threadIdx.x; i < str_val.length; i += blockDim.x) { + for (thread_index_type i = threadIdx.x; i < str_val.length; i += blockDim.x) { dst[i] = src[i]; } if (threadIdx.x == 0) { str_val.ptr = dst; } diff --git a/cpp/src/io/parquet/chunk_dict.cu b/cpp/src/io/parquet/chunk_dict.cu index 72e38fd2e1c..9ff1869edde 100644 --- a/cpp/src/io/parquet/chunk_dict.cu +++ b/cpp/src/io/parquet/chunk_dict.cu @@ -36,10 +36,10 @@ template __global__ void __launch_bounds__(block_size) initialize_chunk_hash_maps_kernel(device_span chunks) { - auto chunk = chunks[blockIdx.x]; - auto t = threadIdx.x; + auto const chunk = chunks[blockIdx.x]; + auto const t = threadIdx.x; // fut: Now that per-chunk dict is same size as ck.num_values, try to not use one block per chunk - for (size_type i = 0; i < chunk.dict_map_size; i += block_size) { + for (thread_index_type i = 0; i < chunk.dict_map_size; i += block_size) { if (t + i < chunk.dict_map_size) { new (&chunk.dict_map_slots[t + i].first) map_type::atomic_key_type{KEY_SENTINEL}; new (&chunk.dict_map_slots[t + i].second) map_type::atomic_mapped_type{VALUE_SENTINEL}; @@ -131,7 +131,7 @@ __global__ void __launch_bounds__(block_size) cuco::empty_value{VALUE_SENTINEL}); __shared__ size_type total_num_dict_entries; - size_type val_idx = s_start_value_idx + t; + thread_index_type val_idx = s_start_value_idx + t; while (val_idx - block_size < end_value_idx) { auto const is_valid = val_idx < end_value_idx and val_idx < data_col.size() and data_col.is_valid(val_idx); @@ -252,11 +252,9 @@ __global__ void __launch_bounds__(block_size) cuco::empty_key{KEY_SENTINEL}, cuco::empty_value{VALUE_SENTINEL}); - auto val_idx = s_start_value_idx + t; + thread_index_type val_idx = s_start_value_idx + t; while (val_idx < end_value_idx) { - auto const is_valid = val_idx < data_col.size() and data_col.is_valid(val_idx); - - if (is_valid) { + if (data_col.is_valid(val_idx)) { auto found_slot = type_dispatcher(data_col.type(), map_find_fn{map}, data_col, val_idx); cudf_assert(found_slot != map.end() && "Unable to find value in map in dictionary index construction"); diff --git a/cpp/src/io/parquet/page_decode.cuh b/cpp/src/io/parquet/page_decode.cuh index b2c09980b6e..f649eb97680 100644 --- a/cpp/src/io/parquet/page_decode.cuh +++ b/cpp/src/io/parquet/page_decode.cuh @@ -29,9 +29,12 @@ constexpr int preprocess_block_size = num_rle_stream_decode_threads; // 512 constexpr int decode_block_size = 128; constexpr int non_zero_buffer_size = decode_block_size * 2; -constexpr int rolling_index(int index) { return index & (non_zero_buffer_size - 1); } +constexpr int rolling_index(cudf::thread_index_type index) +{ + return index & (non_zero_buffer_size - 1); +} template -constexpr int rolling_lvl_index(int index) +constexpr int 
rolling_lvl_index(cudf::thread_index_type index) { return index % lvl_buf_size; } @@ -339,7 +342,7 @@ inline __device__ int gpuDecodeRleBooleans(page_state_s volatile* s, int t) { uint8_t const* end = s->data_end; - int pos = s->dict_pos; + int64_t pos = s->dict_pos; while (pos < target_pos) { int is_literal, batch_len; diff --git a/cpp/src/io/parquet/page_enc.cu b/cpp/src/io/parquet/page_enc.cu index 20993d12af8..d066b454840 100644 --- a/cpp/src/io/parquet/page_enc.cu +++ b/cpp/src/io/parquet/page_enc.cu @@ -242,9 +242,9 @@ __global__ void __launch_bounds__(block_size) { __shared__ __align__(16) frag_init_state_s state_g; - frag_init_state_s* const s = &state_g; - uint32_t const t = threadIdx.x; - uint32_t const num_fragments_per_column = frag.size().second; + frag_init_state_s* const s = &state_g; + auto const t = threadIdx.x; + auto const num_fragments_per_column = frag.size().second; if (t == 0) { s->col = col_desc[blockIdx.x]; } __syncthreads(); @@ -1003,7 +1003,7 @@ __global__ void __launch_bounds__(128, 8) } temp_storage; page_enc_state_s* const s = &state_g; - uint32_t t = threadIdx.x; + auto const t = threadIdx.x; if (t == 0) { state_g = page_enc_state_s{}; @@ -1042,7 +1042,7 @@ __global__ void __launch_bounds__(128, 8) while (s->rle_numvals < s->page.num_rows) { uint32_t rle_numvals = s->rle_numvals; uint32_t nrows = min(s->page.num_rows - rle_numvals, 128); - uint32_t row = s->page.start_row + rle_numvals + t; + auto row = s->page.start_row + rle_numvals + t; // Definition level encodes validity. Checks the valid map and if it is valid, then sets the // def_lvl accordingly and sets it in s->vals which is then given to RleEncode to encode uint32_t def_lvl = [&]() { @@ -1884,7 +1884,7 @@ __global__ void __launch_bounds__(128) __shared__ __align__(8) EncPage page_g; __shared__ __align__(8) unsigned char scratch[MIN_STATS_SCRATCH_SIZE]; - uint32_t t = threadIdx.x; + auto const t = threadIdx.x; if (t == 0) { uint8_t *hdr_start, *hdr_end; @@ -1972,7 +1972,7 @@ __global__ void __launch_bounds__(1024) __shared__ __align__(8) EncColumnChunk ck_g; __shared__ __align__(8) EncPage page_g; - uint32_t t = threadIdx.x; + auto const t = threadIdx.x; uint8_t *dst, *dst_base; EncPage const* first_page; uint32_t num_pages, uncompressed_size; diff --git a/cpp/src/io/parquet/page_string_decode.cu b/cpp/src/io/parquet/page_string_decode.cu index 9173d408192..bcab14f76c5 100644 --- a/cpp/src/io/parquet/page_string_decode.cu +++ b/cpp/src/io/parquet/page_string_decode.cu @@ -100,12 +100,12 @@ __device__ void block_excl_sum(size_type* arr, size_type length, size_type initi { using block_scan = cub::BlockScan; __shared__ typename block_scan::TempStorage scan_storage; - int const t = threadIdx.x; + auto const t = threadIdx.x; // do a series of block sums, storing results in arr as we go - for (int pos = 0; pos < length; pos += block_size) { - int const tidx = pos + t; - size_type tval = tidx < length ? arr[tidx] : 0; + for (thread_index_type pos = 0; pos < length; pos += block_size) { + auto const tidx = pos + t; + size_type tval = tidx < length ? 
arr[tidx] : 0; size_type block_sum; block_scan(scan_storage).ExclusiveScan(tval, tval, initial_value, cub::Sum(), block_sum); if (tidx < length) { arr[tidx] = tval; } @@ -144,7 +144,7 @@ __device__ thrust::pair page_bounds(page_state_s* const s, typename block_scan::TempStorage scan_storage; } temp_storage; - int const t = threadIdx.x; + auto const t = threadIdx.x; // decode batches of level stream data using rle_stream objects and use the results to // calculate start and end value positions in the encoded string data. @@ -213,7 +213,7 @@ __device__ thrust::pair page_bounds(page_state_s* const s, bool end_value_set = false; while (processed < s->page.num_input_values) { - int start_val = processed; + thread_index_type start_val = processed; if (has_repetition) { decoders[level_type::REPETITION].decode_next(t); @@ -237,8 +237,8 @@ __device__ thrust::pair page_bounds(page_state_s* const s, // do something with the level data while (start_val < processed) { - int idx_t = start_val + t; - int idx = rolling_lvl_index(idx_t); + auto const idx_t = start_val + t; + auto const idx = rolling_lvl_index(idx_t); // get absolute thread row index int is_new_row = idx_t < processed && (!has_repetition || rep_decode[idx] == 0); @@ -329,14 +329,14 @@ __device__ thrust::pair page_bounds(page_state_s* const s, else { int num_nulls = 0; while (processed < s->page.num_input_values) { - int start_val = processed; + thread_index_type start_val = processed; processed += decoders[level_type::DEFINITION].decode_next(t); __syncthreads(); while (start_val < processed) { - int idx_t = start_val + t; + auto const idx_t = start_val + t; if (idx_t < processed) { - int idx = rolling_lvl_index(idx_t); + auto const idx = rolling_lvl_index(idx_t); if (def_decode[idx] < max_def) { num_nulls++; } } start_val += preprocess_block_size; diff --git a/cpp/src/io/text/multibyte_split.cu b/cpp/src/io/text/multibyte_split.cu index a04c7d84463..818bbc0a18a 100644 --- a/cpp/src/io/text/multibyte_split.cu +++ b/cpp/src/io/text/multibyte_split.cu @@ -20,6 +20,7 @@ #include #include #include +#include #include #include #include @@ -143,7 +144,7 @@ __global__ void multibyte_split_init_kernel( cudf::io::text::detail::scan_tile_status status = cudf::io::text::detail::scan_tile_status::invalid) { - auto const thread_idx = blockIdx.x * blockDim.x + threadIdx.x; + auto const thread_idx = cudf::detail::grid_1d::global_thread_id(); if (thread_idx < num_tiles) { auto const tile_idx = base_tile_idx + thread_idx; tile_multistates.set_status(tile_idx, status); @@ -151,19 +152,6 @@ __global__ void multibyte_split_init_kernel( } } -__global__ void multibyte_split_seed_kernel( - cudf::io::text::detail::scan_tile_state_view tile_multistates, - cudf::io::text::detail::scan_tile_state_view tile_output_offsets, - multistate tile_multistate_seed, - output_offset tile_output_offset) -{ - auto const thread_idx = blockIdx.x * blockDim.x + threadIdx.x; - if (thread_idx == 0) { - tile_multistates.set_inclusive_prefix(-1, tile_multistate_seed); - tile_output_offsets.set_inclusive_prefix(-1, tile_output_offset); - } -} - __global__ __launch_bounds__(THREADS_PER_TILE) void multibyte_split_kernel( cudf::size_type base_tile_idx, byte_offset base_input_offset, @@ -185,10 +173,12 @@ __global__ __launch_bounds__(THREADS_PER_TILE) void multibyte_split_kernel( typename OffsetScan::TempStorage offset_scan; } temp_storage; - int32_t const tile_idx = base_tile_idx + blockIdx.x; - int32_t const tile_input_offset = blockIdx.x * ITEMS_PER_TILE; - int32_t const thread_input_offset 
= tile_input_offset + threadIdx.x * ITEMS_PER_THREAD; - int32_t const thread_input_size = chunk_input_chars.size() - thread_input_offset; + auto const tile_idx = base_tile_idx + blockIdx.x; + auto const tile_input_offset = blockIdx.x * ITEMS_PER_TILE; + auto const thread_input_offset = + tile_input_offset + cudf::thread_index_type{threadIdx.x} * ITEMS_PER_THREAD; + auto const thread_input_size = + std::max(chunk_input_chars.size() - thread_input_offset, 0); // STEP 1: Load inputs @@ -258,10 +248,12 @@ __global__ __launch_bounds__(THREADS_PER_TILE) void byte_split_kernel( typename OffsetScan::TempStorage offset_scan; } temp_storage; - int32_t const tile_idx = base_tile_idx + blockIdx.x; - int32_t const tile_input_offset = blockIdx.x * ITEMS_PER_TILE; - int32_t const thread_input_offset = tile_input_offset + threadIdx.x * ITEMS_PER_THREAD; - int32_t const thread_input_size = chunk_input_chars.size() - thread_input_offset; + auto const tile_idx = base_tile_idx + blockIdx.x; + auto const tile_input_offset = blockIdx.x * ITEMS_PER_TILE; + auto const thread_input_offset = + tile_input_offset + cudf::thread_index_type{threadIdx.x} * ITEMS_PER_THREAD; + auto const thread_input_size = + std::max(chunk_input_chars.size() - thread_input_offset, 0); // STEP 1: Load inputs @@ -401,11 +393,14 @@ std::unique_ptr multibyte_split(cudf::io::text::data_chunk_source // Seeding the tile state with an identity value allows the 0th tile to follow the same logic as // the Nth tile, assuming it can look up an inclusive prefix. Without this seed, the 0th block // would have to follow separate logic. - multibyte_split_seed_kernel<<<1, 1, 0, stream.value()>>>( // - tile_multistates, - tile_offsets, - multistate_seed, - 0); + cudf::detail::device_single_thread( + [tm = scan_tile_state_view(tile_multistates), + to = scan_tile_state_view(tile_offsets), + multistate_seed] __device__() mutable { + tm.set_inclusive_prefix(-1, multistate_seed); + to.set_inclusive_prefix(-1, 0); + }, + stream); auto reader = source.create_reader(); auto chunk_offset = std::max(0, byte_range.offset() - delimiter.size()); diff --git a/cpp/src/io/utilities/parsing_utils.cu b/cpp/src/io/utilities/parsing_utils.cu index 3d478471833..06b86f33c85 100644 --- a/cpp/src/io/utilities/parsing_utils.cu +++ b/cpp/src/io/utilities/parsing_utils.cu @@ -14,6 +14,7 @@ * limitations under the License. */ +#include #include #include #include @@ -94,8 +95,8 @@ __global__ void count_and_set_positions(char const* data, T* positions) { // thread IDs range per block, so also need the block id - uint64_t const tid = threadIdx.x + (blockDim.x * blockIdx.x); - uint64_t const did = tid * bytes_per_find_thread; + auto const tid = cudf::detail::grid_1d::global_thread_id(); + auto const did = tid * bytes_per_find_thread; char const* raw = (data + did); From abac227113db0b7580ce7111dd8fc6242a64d1be Mon Sep 17 00:00:00 2001 From: GALI PREM SAGAR Date: Wed, 23 Aug 2023 16:03:32 -0500 Subject: [PATCH 088/230] Fix construction of `Grouping` objects (#13932) This PR fixes multiple issues with groupby where the consequent operations on a groupby object always return incorrect results due to an inplace modification that happens in the `Groupby` constructor. To fix this, take a copy of the `by` argument whenever it is an (internal) `_Grouping` object. 
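A minimal user-facing repro (mirroring the `test_groupby_consecutive_operations` test added below; shown only to illustrate the behaviour being fixed):

```
import numpy as np
import cudf

df = cudf.DataFrame([[1, np.nan], [1, 4], [5, 6]], columns=["A", "B"])
gb = df.groupby("A")

# Before this fix, the first aggregation modified the GroupBy's internal
# _Grouping in place, so reusing the same object returned incorrect results.
first = gb.cumsum()
second = gb.cumsum()  # with the fix, this matches `first`
```
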
- Closes #13923 - Closes #13390 Authors: - GALI PREM SAGAR (https://github.com/galipremsagar) Approvers: - Lawrence Mitchell (https://github.com/wence-) - Bradley Dice (https://github.com/bdice) URL: https://github.com/rapidsai/cudf/pull/13932 --- python/cudf/cudf/core/groupby/groupby.py | 18 +++++-- python/cudf/cudf/core/resample.py | 61 +++++++++++++++++++++++- python/cudf/cudf/tests/test_groupby.py | 33 +++++++++++++ python/cudf/cudf/tests/test_serialize.py | 11 +++++ 4 files changed, 117 insertions(+), 6 deletions(-) diff --git a/python/cudf/cudf/core/groupby/groupby.py b/python/cudf/cudf/core/groupby/groupby.py index 2ed9bed5b49..b0be97915f2 100644 --- a/python/cudf/cudf/core/groupby/groupby.py +++ b/python/cudf/cudf/core/groupby/groupby.py @@ -1,5 +1,6 @@ # Copyright (c) 2020-2023, NVIDIA CORPORATION. +import copy import itertools import pickle import textwrap @@ -261,17 +262,17 @@ def __init__( """ self.obj = obj self._as_index = as_index - self._by = by + self._by = by.copy(deep=True) if isinstance(by, _Grouping) else by self._level = level self._sort = sort self._dropna = dropna self._group_keys = group_keys - if isinstance(by, _Grouping): - by._obj = self.obj - self.grouping = by + if isinstance(self._by, _Grouping): + self._by._obj = self.obj + self.grouping = self._by else: - self.grouping = _Grouping(obj, by, level) + self.grouping = _Grouping(obj, self._by, level) def __iter__(self): if isinstance(self._by, list) and len(self._by) == 1: @@ -2547,6 +2548,13 @@ def deserialize(cls, header, frames): out._key_columns = key_columns return out + def copy(self, deep=True): + out = _Grouping.__new__(_Grouping) + out.names = copy.deepcopy(self.names) + out._named_columns = copy.deepcopy(self._named_columns) + out._key_columns = [col.copy(deep=deep) for col in self._key_columns] + return out + def _is_multi_agg(aggs): """ diff --git a/python/cudf/cudf/core/resample.py b/python/cudf/cudf/core/resample.py index df901f05787..eb59cf83926 100644 --- a/python/cudf/cudf/core/resample.py +++ b/python/cudf/cudf/core/resample.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: Copyright (c) 2021-2022, NVIDIA CORPORATION & +# SPDX-FileCopyrightText: Copyright (c) 2021-2023, NVIDIA CORPORATION & # AFFILIATES. All rights reserved. SPDX-License-Identifier: # Apache-2.0 # @@ -14,6 +14,8 @@ # See the License for the specific language governing permissions and # limitations under the License. 
+import pickle + import numpy as np import pandas as pd @@ -82,6 +84,30 @@ def _scan_fill(self, method: str, limit: int) -> DataFrameOrSeries: allow_non_unique=True, ) + def serialize(self): + header, frames = super().serialize() + grouping_head, grouping_frames = self.grouping.serialize() + header["grouping"] = grouping_head + header["resampler_type"] = pickle.dumps(type(self)) + header["grouping_frames_count"] = len(grouping_frames) + frames.extend(grouping_frames) + return header, frames + + @classmethod + def deserialize(cls, header, frames): + obj_type = pickle.loads(header["obj_type"]) + obj = obj_type.deserialize( + header["obj"], frames[: header["num_obj_frames"]] + ) + grouping = _ResampleGrouping.deserialize( + header["grouping"], frames[header["num_obj_frames"] :] + ) + resampler_cls = pickle.loads(header["resampler_type"]) + out = resampler_cls.__new__(resampler_cls) + out.grouping = grouping + super().__init__(out, obj, by=grouping) + return out + class DataFrameResampler(_Resampler, DataFrameGroupBy): pass @@ -95,6 +121,39 @@ class _ResampleGrouping(_Grouping): bin_labels: cudf.core.index.Index + def copy(self, deep=True): + out = super().copy(deep=deep) + result = _ResampleGrouping.__new__(_ResampleGrouping) + result.names = out.names + result._named_columns = out._named_columns + result._key_columns = out._key_columns + result.bin_labels = self.bin_labels.copy(deep=deep) + return result + + def serialize(self): + header, frames = super().serialize() + labels_head, labels_frames = self.bin_labels.serialize() + header["__bin_labels"] = labels_head + header["__bin_labels_count"] = len(labels_frames) + frames.extend(labels_frames) + return header, frames + + @classmethod + def deserialize(cls, header, frames): + names = pickle.loads(header["names"]) + _named_columns = pickle.loads(header["_named_columns"]) + key_columns = cudf.core.column.deserialize_columns( + header["columns"], frames[: -header["__bin_labels_count"]] + ) + out = _ResampleGrouping.__new__(_ResampleGrouping) + out.names = names + out._named_columns = _named_columns + out._key_columns = key_columns + out.bin_labels = cudf.core.index.Index.deserialize( + header["__bin_labels"], frames[-header["__bin_labels_count"] :] + ) + return out + def _handle_frequency_grouper(self, by): # if `by` is a time frequency grouper, we bin the key column # using bin intervals specified by `by.freq`, then use *that* diff --git a/python/cudf/cudf/tests/test_groupby.py b/python/cudf/cudf/tests/test_groupby.py index e578e1061ca..a3b205cc16b 100644 --- a/python/cudf/cudf/tests/test_groupby.py +++ b/python/cudf/cudf/tests/test_groupby.py @@ -3340,3 +3340,36 @@ def test_group_by_pandas_sort_order(groups, sort): pdf.groupby(groups, sort=sort).sum(), df.groupby(groups, sort=sort).sum(), ) + + +def test_groupby_consecutive_operations(): + df = cudf.DataFrame([[1, np.nan], [1, 4], [5, 6]], columns=["A", "B"]) + pdf = df.to_pandas() + + gg = df.groupby("A") + pg = pdf.groupby("A") + + actual = gg.nth(-1) + expected = pg.nth(-1) + + assert_groupby_results_equal(actual, expected, check_dtype=False) + + actual = gg.nth(0) + expected = pg.nth(0) + + assert_groupby_results_equal(actual, expected, check_dtype=False) + + actual = gg.cumsum() + expected = pg.cumsum() + + assert_groupby_results_equal(actual, expected, check_dtype=False) + + actual = gg.cumcount() + expected = pg.cumcount() + + assert_groupby_results_equal(actual, expected, check_dtype=False) + + actual = gg.cumsum() + expected = pg.cumsum() + + assert_groupby_results_equal(actual, 
expected, check_dtype=False) diff --git a/python/cudf/cudf/tests/test_serialize.py b/python/cudf/cudf/tests/test_serialize.py index e7f26e259c6..01dd11ad7ad 100644 --- a/python/cudf/cudf/tests/test_serialize.py +++ b/python/cudf/cudf/tests/test_serialize.py @@ -347,6 +347,17 @@ def test_serialize_seriesgroupby(): assert_eq(recreated.sum(), gb.sum()) +def test_serialize_seriesresampler(): + index = cudf.date_range(start="2001-01-01", periods=10, freq="1T") + sr = cudf.Series(range(10), index=index) + re_sampler = sr.resample("3T") + actual = re_sampler.sum() + recreated = re_sampler.__class__.deserialize(*re_sampler.serialize()) + expected = recreated.sum() + + assert_eq(actual, expected) + + def test_serialize_string_check_buffer_sizes(): df = cudf.DataFrame({"a": ["a", "b", "cd", None]}) expect = df.memory_usage(deep=True).loc["a"] From c39c04d51c40969b14193c29db67bf59485362bf Mon Sep 17 00:00:00 2001 From: Ed Seidl Date: Wed, 23 Aug 2023 14:13:17 -0700 Subject: [PATCH 089/230] [FEA] Add DELTA_BINARY_PACKED decoding support to Parquet reader (#13637) Part of #13501. This adds support for decoding Parquet pages that are DELTA_BINARY_PACKED. In addition to adding delta support, this PR incorporates changes introduced in #13622, such as using a mask to determine which decoding kernels to run, and adding parameters to the `page_state_buffers_s` struct to reduce the amount of shared memory used. Authors: - Ed Seidl (https://github.com/etseidl) - Vukasin Milovanovic (https://github.com/vuule) Approvers: - Vukasin Milovanovic (https://github.com/vuule) - https://github.com/nvdbaranec - Bradley Dice (https://github.com/bdice) - GALI PREM SAGAR (https://github.com/galipremsagar) URL: https://github.com/rapidsai/cudf/pull/13637 --- cpp/CMakeLists.txt | 2 + cpp/src/io/parquet/decode_preprocess.cu | 417 ++++++++++++++++ cpp/src/io/parquet/delta_binary.cuh | 294 ++++++++++++ cpp/src/io/parquet/page_data.cu | 453 +++--------------- cpp/src/io/parquet/page_decode.cuh | 167 +++---- cpp/src/io/parquet/page_delta_decode.cu | 176 +++++++ cpp/src/io/parquet/page_hdr.cu | 24 + cpp/src/io/parquet/page_string_decode.cu | 175 ++----- cpp/src/io/parquet/page_string_utils.cuh | 110 +++++ cpp/src/io/parquet/parquet_gpu.hpp | 52 ++ cpp/src/io/parquet/reader_impl.cpp | 40 +- cpp/src/io/parquet/reader_impl_preprocess.cu | 7 +- cpp/src/io/parquet/rle_stream.cuh | 36 +- .../tests/data/parquet/delta_encoding.parquet | Bin 577 -> 577 bytes python/cudf/cudf/tests/test_parquet.py | 50 ++ 15 files changed, 1376 insertions(+), 627 deletions(-) create mode 100644 cpp/src/io/parquet/decode_preprocess.cu create mode 100644 cpp/src/io/parquet/delta_binary.cuh create mode 100644 cpp/src/io/parquet/page_delta_decode.cu create mode 100644 cpp/src/io/parquet/page_string_utils.cuh diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 054f3b290a3..516865e5782 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -390,10 +390,12 @@ add_library( src/io/orc/writer_impl.cu src/io/parquet/compact_protocol_reader.cpp src/io/parquet/compact_protocol_writer.cpp + src/io/parquet/decode_preprocess.cu src/io/parquet/page_data.cu src/io/parquet/chunk_dict.cu src/io/parquet/page_enc.cu src/io/parquet/page_hdr.cu + src/io/parquet/page_delta_decode.cu src/io/parquet/page_string_decode.cu src/io/parquet/predicate_pushdown.cpp src/io/parquet/reader.cpp diff --git a/cpp/src/io/parquet/decode_preprocess.cu b/cpp/src/io/parquet/decode_preprocess.cu new file mode 100644 index 00000000000..8de3702bc2e --- /dev/null +++ 
b/cpp/src/io/parquet/decode_preprocess.cu @@ -0,0 +1,417 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "page_decode.cuh" + +#include + +#include + +#include +#include + +namespace cudf { +namespace io { +namespace parquet { +namespace gpu { + +namespace { + +// # of threads we're decoding with +constexpr int preprocess_block_size = 512; + +// the required number of runs in shared memory we will need to provide the +// rle_stream object +constexpr int rle_run_buffer_size = rle_stream_required_run_buffer_size(); + +// the size of the rolling batch buffer +constexpr int rolling_buf_size = LEVEL_DECODE_BUF_SIZE; + +using unused_state_buf = page_state_buffers_s<0, 0, 0>; + +/** + * + * This function expects the dictionary position to be at 0 and will traverse + * the entire thing. + * + * Operates on a single warp only. Expects t < 32 + * + * @param s The local page info + * @param t Thread index + */ +__device__ size_type gpuDecodeTotalPageStringSize(page_state_s* s, int t) +{ + size_type target_pos = s->num_input_values; + size_type str_len = 0; + if (s->dict_base) { + auto const [new_target_pos, len] = + gpuDecodeDictionaryIndices(s, nullptr, target_pos, t); + target_pos = new_target_pos; + str_len = len; + } else if ((s->col.data_type & 7) == BYTE_ARRAY) { + str_len = gpuInitStringDescriptors(s, nullptr, target_pos, t); + } + if (!t) { *(int32_t volatile*)&s->dict_pos = target_pos; } + return str_len; +} + +/** + * @brief Update output column sizes for every nesting level based on a batch + * of incoming decoded definition and repetition level values. + * + * If bounds_set is true, computes skipped_values and skipped_leaf_values for the + * page to indicate where we need to skip to based on min/max row. + * + * Operates at the block level. 
+ * + * @param s The local page info + * @param target_value_count The target value count to process up to + * @param rep Repetition level buffer + * @param def Definition level buffer + * @param t Thread index + * @param bounds_set A boolean indicating whether or not min/max row bounds have been set + */ +template +static __device__ void gpuUpdatePageSizes(page_state_s* s, + int target_value_count, + level_t const* const rep, + level_t const* const def, + int t, + bool bounds_set) +{ + // max nesting depth of the column + int const max_depth = s->col.max_nesting_depth; + + constexpr int num_warps = preprocess_block_size / 32; + constexpr int max_batch_size = num_warps * 32; + + using block_reduce = cub::BlockReduce; + using block_scan = cub::BlockScan; + __shared__ union { + typename block_reduce::TempStorage reduce_storage; + typename block_scan::TempStorage scan_storage; + } temp_storage; + + // how many input level values we've processed in the page so far + int value_count = s->input_value_count; + // how many rows we've processed in the page so far + int row_count = s->input_row_count; + // how many leaf values we've processed in the page so far + int leaf_count = s->input_leaf_count; + // whether or not we need to continue checking for the first row + bool skipped_values_set = s->page.skipped_values >= 0; + + while (value_count < target_value_count) { + int const batch_size = min(max_batch_size, target_value_count - value_count); + + // start/end depth + int start_depth, end_depth, d; + get_nesting_bounds( + start_depth, end_depth, d, s, rep, def, value_count, value_count + batch_size, t); + + // is this thread within row bounds? in the non skip_rows/num_rows case this will always + // be true. + int in_row_bounds = 1; + + // if we are in the skip_rows/num_rows case, we need to check against these limits + if (bounds_set) { + // get absolute thread row index + int const is_new_row = start_depth == 0; + int thread_row_count, block_row_count; + block_scan(temp_storage.scan_storage) + .InclusiveSum(is_new_row, thread_row_count, block_row_count); + __syncthreads(); + + // get absolute thread leaf index + int const is_new_leaf = (d >= s->nesting_info[max_depth - 1].max_def_level); + int thread_leaf_count, block_leaf_count; + block_scan(temp_storage.scan_storage) + .InclusiveSum(is_new_leaf, thread_leaf_count, block_leaf_count); + __syncthreads(); + + // if this thread is in row bounds + int const row_index = (thread_row_count + row_count) - 1; + in_row_bounds = + (row_index >= s->row_index_lower_bound) && (row_index < (s->first_row + s->num_rows)); + + // if we have not set skipped values yet, see if we found the first in-bounds row + if (!skipped_values_set) { + int local_count, global_count; + block_scan(temp_storage.scan_storage) + .InclusiveSum(in_row_bounds, local_count, global_count); + __syncthreads(); + + // we found it + if (global_count > 0) { + // this is the thread that represents the first row. + if (local_count == 1 && in_row_bounds) { + s->page.skipped_values = value_count + t; + s->page.skipped_leaf_values = + leaf_count + (is_new_leaf ? 
thread_leaf_count - 1 : thread_leaf_count); + } + skipped_values_set = true; + } + } + + row_count += block_row_count; + leaf_count += block_leaf_count; + } + + // increment value counts across all nesting depths + for (int s_idx = 0; s_idx < max_depth; s_idx++) { + int const in_nesting_bounds = (s_idx >= start_depth && s_idx <= end_depth && in_row_bounds); + int const count = block_reduce(temp_storage.reduce_storage).Sum(in_nesting_bounds); + __syncthreads(); + if (!t) { + PageNestingInfo* pni = &s->page.nesting[s_idx]; + pni->batch_size += count; + } + } + + value_count += batch_size; + } + + // update final outputs + if (!t) { + s->input_value_count = value_count; + + // only used in the skip_rows/num_rows case + s->input_leaf_count = leaf_count; + s->input_row_count = row_count; + } +} + +/** + * @brief Kernel for computing per-page column size information for all nesting levels. + * + * This function will write out the size field for each level of nesting. + * + * @param pages List of pages + * @param chunks List of column chunks + * @param min_row Row index to start reading at + * @param num_rows Maximum number of rows to read. Pass as INT_MAX to guarantee reading all rows + * @param is_base_pass Whether or not this is the base pass. We first have to compute + * the full size information of every page before we come through in a second (trim) pass + * to determine what subset of rows in this page we should be reading + * @param compute_string_sizes Whether or not we should be computing string sizes + * (PageInfo::str_bytes) as part of the pass + */ +template +__global__ void __launch_bounds__(preprocess_block_size) + gpuComputePageSizes(PageInfo* pages, + device_span chunks, + size_t min_row, + size_t num_rows, + bool is_base_pass, + bool compute_string_sizes) +{ + __shared__ __align__(16) page_state_s state_g; + + page_state_s* const s = &state_g; + int page_idx = blockIdx.x; + int t = threadIdx.x; + PageInfo* pp = &pages[page_idx]; + + // whether or not we have repetition levels (lists) + bool has_repetition = chunks[pp->chunk_idx].max_level[level_type::REPETITION] > 0; + + // the level stream decoders + __shared__ rle_run def_runs[rle_run_buffer_size]; + __shared__ rle_run rep_runs[rle_run_buffer_size]; + rle_stream decoders[level_type::NUM_LEVEL_TYPES] = {{def_runs}, + {rep_runs}}; + + // setup page info + if (!setupLocalPageInfo(s, pp, chunks, min_row, num_rows, all_types_filter{}, false)) { return; } + + // initialize the stream decoders (requires values computed in setupLocalPageInfo) + // the size of the rolling batch buffer + int const max_batch_size = rolling_buf_size; + level_t* rep = reinterpret_cast(pp->lvl_decode_buf[level_type::REPETITION]); + level_t* def = reinterpret_cast(pp->lvl_decode_buf[level_type::DEFINITION]); + decoders[level_type::DEFINITION].init(s->col.level_bits[level_type::DEFINITION], + s->abs_lvl_start[level_type::DEFINITION], + s->abs_lvl_end[level_type::DEFINITION], + max_batch_size, + def, + s->page.num_input_values); + if (has_repetition) { + decoders[level_type::REPETITION].init(s->col.level_bits[level_type::REPETITION], + s->abs_lvl_start[level_type::REPETITION], + s->abs_lvl_end[level_type::REPETITION], + max_batch_size, + rep, + s->page.num_input_values); + } + __syncthreads(); + + if (!t) { + s->page.skipped_values = -1; + s->page.skipped_leaf_values = 0; + s->page.str_bytes = 0; + s->input_row_count = 0; + s->input_value_count = 0; + + // in the base pass, we're computing the number of rows, make sure we visit absolutely + // everything + if 
(is_base_pass) { + s->first_row = 0; + s->num_rows = INT_MAX; + s->row_index_lower_bound = -1; + } + } + + // we only need to preprocess hierarchies with repetition in them (ie, hierarchies + // containing lists anywhere within). + compute_string_sizes = + compute_string_sizes && ((s->col.data_type & 7) == BYTE_ARRAY && s->dtype_len != 4); + + // early out optimizations: + + // - if this is a flat hierarchy (no lists) and is not a string column. in this case we don't need + // to do the expensive work of traversing the level data to determine sizes. we can just compute + // it directly. + if (!has_repetition && !compute_string_sizes) { + int depth = 0; + while (depth < s->page.num_output_nesting_levels) { + auto const thread_depth = depth + t; + if (thread_depth < s->page.num_output_nesting_levels) { + if (is_base_pass) { pp->nesting[thread_depth].size = pp->num_input_values; } + pp->nesting[thread_depth].batch_size = pp->num_input_values; + } + depth += blockDim.x; + } + return; + } + + // in the trim pass, for anything with lists, we only need to fully process bounding pages (those + // at the beginning or the end of the row bounds) + if (!is_base_pass && !is_bounds_page(s, min_row, num_rows, has_repetition)) { + int depth = 0; + while (depth < s->page.num_output_nesting_levels) { + auto const thread_depth = depth + t; + if (thread_depth < s->page.num_output_nesting_levels) { + // if we are not a bounding page (as checked above) then we are either + // returning all rows/values from this page, or 0 of them + pp->nesting[thread_depth].batch_size = + (s->num_rows == 0 && !is_page_contained(s, min_row, num_rows)) + ? 0 + : pp->nesting[thread_depth].size; + } + depth += blockDim.x; + } + return; + } + + // zero sizes + int depth = 0; + while (depth < s->page.num_output_nesting_levels) { + auto const thread_depth = depth + t; + if (thread_depth < s->page.num_output_nesting_levels) { + s->page.nesting[thread_depth].batch_size = 0; + } + depth += blockDim.x; + } + __syncthreads(); + + // the core loop. decode batches of level stream data using rle_stream objects + // and pass the results to gpuUpdatePageSizes + int processed = 0; + while (processed < s->page.num_input_values) { + // TODO: it would not take much more work to make it so that we could run both of these + // decodes concurrently. there are a couple of shared variables internally that would have to + // get dealt with but that's about it. + if (has_repetition) { + decoders[level_type::REPETITION].decode_next(t); + __syncthreads(); + } + // the # of rep/def levels will always be the same size + processed += decoders[level_type::DEFINITION].decode_next(t); + __syncthreads(); + + // update page sizes + gpuUpdatePageSizes(s, processed, rep, def, t, !is_base_pass); + __syncthreads(); + } + + // retrieve total string size. 
+ // TODO: make this block-based instead of just 1 warp + if (compute_string_sizes) { + if (t < 32) { s->page.str_bytes = gpuDecodeTotalPageStringSize(s, t); } + } + + // update output results: + // - real number of rows for the whole page + // - nesting sizes for the whole page + // - skipped value information for trimmed pages + // - string bytes + if (is_base_pass) { + // nesting level 0 is the root column, so the size is also the # of rows + if (!t) { pp->num_rows = s->page.nesting[0].batch_size; } + + // store off this batch size as the "full" size + int depth = 0; + while (depth < s->page.num_output_nesting_levels) { + auto const thread_depth = depth + t; + if (thread_depth < s->page.num_output_nesting_levels) { + pp->nesting[thread_depth].size = pp->nesting[thread_depth].batch_size; + } + depth += blockDim.x; + } + } + + if (!t) { + pp->skipped_values = s->page.skipped_values; + pp->skipped_leaf_values = s->page.skipped_leaf_values; + pp->str_bytes = s->page.str_bytes; + } +} + +} // anonymous namespace + +/** + * @copydoc cudf::io::parquet::gpu::ComputePageSizes + */ +void ComputePageSizes(cudf::detail::hostdevice_vector& pages, + cudf::detail::hostdevice_vector const& chunks, + size_t min_row, + size_t num_rows, + bool compute_num_rows, + bool compute_string_sizes, + int level_type_size, + rmm::cuda_stream_view stream) +{ + dim3 dim_block(preprocess_block_size, 1); + dim3 dim_grid(pages.size(), 1); // 1 threadblock per page + + // computes: + // PageNestingInfo::size for each level of nesting, for each page. + // This computes the size for the entire page, not taking row bounds into account. + // If uses_custom_row_bounds is set to true, we have to do a second pass later that "trims" + // the starting and ending read values to account for these bounds. + if (level_type_size == 1) { + gpuComputePageSizes<<>>( + pages.device_ptr(), chunks, min_row, num_rows, compute_num_rows, compute_string_sizes); + } else { + gpuComputePageSizes<<>>( + pages.device_ptr(), chunks, min_row, num_rows, compute_num_rows, compute_string_sizes); + } +} + +} // namespace gpu +} // namespace parquet +} // namespace io +} // namespace cudf diff --git a/cpp/src/io/parquet/delta_binary.cuh b/cpp/src/io/parquet/delta_binary.cuh new file mode 100644 index 00000000000..4fc8b9cfb8e --- /dev/null +++ b/cpp/src/io/parquet/delta_binary.cuh @@ -0,0 +1,294 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include "page_decode.cuh" + +namespace cudf::io::parquet::gpu { + +// DELTA_XXX encoding support +// +// DELTA_BINARY_PACKED is used for INT32 and INT64 data types. Encoding begins with a header +// containing a block size, number of mini-blocks in each block, total value count, and first +// value. The first three are ULEB128 variable length ints, and the last is a zigzag ULEB128 +// varint. 
+// -- the block size is a multiple of 128 +// -- the mini-block count is chosen so that each mini-block will contain a multiple of 32 values +// -- the value count includes the first value stored in the header +// +// It seems most Parquet encoders will stick with a block size of 128, and 4 mini-blocks of 32 +// elements each. arrow-rs will use a block size of 256 for 64-bit ints. +// +// Following the header are the data blocks. Each block is further divided into mini-blocks, with +// each mini-block having its own encoding bitwidth. Each block begins with a header containing a +// zigzag ULEB128 encoded minimum delta value, followed by an array of uint8 bitwidths, one entry +// per mini-block. While encoding, the lowest delta value is subtracted from all the deltas in the +// block to ensure that all encoded values are positive. The deltas for each mini-block are bit +// packed using the same encoding as the RLE/Bit-Packing Hybrid encoder. +// +// DELTA_BYTE_ARRAY encoding (incremental encoding or front compression), is used for BYTE_ARRAY +// columns. For each element in a sequence of strings, a prefix length from the preceding string +// and a suffix is stored. The prefix lengths are DELTA_BINARY_PACKED encoded. The suffixes are +// encoded with DELTA_LENGTH_BYTE_ARRAY encoding, which is a DELTA_BINARY_PACKED list of suffix +// lengths, followed by the concatenated suffix data. + +// TODO: The delta encodings use ULEB128 integers, but for now we're only +// using max 64 bits. Need to see what the performance impact is of using +// __int128_t rather than int64_t. +using uleb128_t = uint64_t; +using zigzag128_t = int64_t; + +// we decode one mini-block at a time. max mini-block size seen is 64. +constexpr int delta_rolling_buf_size = 128; + +/** + * @brief Read a ULEB128 varint integer + * + * @param[in,out] cur The current data position, updated after the read + * @param[in] end The end data position + * + * @return The value read + */ +inline __device__ uleb128_t get_uleb128(uint8_t const*& cur, uint8_t const* end) +{ + uleb128_t v = 0, l = 0, c; + while (cur < end) { + c = *cur++; + v |= (c & 0x7f) << l; + l += 7; + if ((c & 0x80) == 0) { return v; } + } + return v; +} + +/** + * @brief Read a ULEB128 zig-zag encoded varint integer + * + * @param[in,out] cur The current data position, updated after the read + * @param[in] end The end data position + * + * @return The value read + */ +inline __device__ zigzag128_t get_zz128(uint8_t const*& cur, uint8_t const* end) +{ + uleb128_t u = get_uleb128(cur, end); + return static_cast((u >> 1u) ^ -static_cast(u & 1)); +} + +struct delta_binary_decoder { + uint8_t const* block_start; // start of data, but updated as data is read + uint8_t const* block_end; // end of data + uleb128_t block_size; // usually 128, must be multiple of 128 + uleb128_t mini_block_count; // usually 4, chosen such that block_size/mini_block_count is a + // multiple of 32 + uleb128_t value_count; // total values encoded in the block + zigzag128_t last_value; // last value decoded, initialized to first_value from header + + uint32_t values_per_mb; // block_size / mini_block_count, must be multiple of 32 + uint32_t current_value_idx; // current value index, initialized to 0 at start of block + + zigzag128_t cur_min_delta; // min delta for the block + uint32_t cur_mb; // index of the current mini-block within the block + uint8_t const* cur_mb_start; // pointer to the start of the current mini-block data + uint8_t const* cur_bitwidths; // pointer to the bitwidth array in the 
block + + uleb128_t value[delta_rolling_buf_size]; // circular buffer of delta values + + // returns the number of values encoded in the block data. when all_values is true, + // account for the first value in the header. otherwise just count the values encoded + // in the mini-block data. + constexpr uint32_t num_encoded_values(bool all_values) + { + return value_count == 0 ? 0 : all_values ? value_count : value_count - 1; + } + + // read mini-block header into state object. should only be called from init_binary_block or + // setup_next_mini_block. header format is: + // + // | min delta (int) | bit-width array (1 byte * mini_block_count) | + // + // on exit db->cur_mb is 0 and db->cur_mb_start points to the first mini-block of data, or + // nullptr if out of data. + // is_decode indicates whether this is being called from initialization code (false) or + // the actual decoding (true) + inline __device__ void init_mini_block(bool is_decode) + { + cur_mb = 0; + cur_mb_start = nullptr; + + if (current_value_idx < num_encoded_values(is_decode)) { + auto d_start = block_start; + cur_min_delta = get_zz128(d_start, block_end); + cur_bitwidths = d_start; + + d_start += mini_block_count; + cur_mb_start = d_start; + } + } + + // read delta binary header into state object. should be called on thread 0. header format is: + // + // | block size (uint) | mini-block count (uint) | value count (uint) | first value (int) | + // + // also initializes the first mini-block before exit + inline __device__ void init_binary_block(uint8_t const* d_start, uint8_t const* d_end) + { + block_end = d_end; + block_size = get_uleb128(d_start, d_end); + mini_block_count = get_uleb128(d_start, d_end); + value_count = get_uleb128(d_start, d_end); + last_value = get_zz128(d_start, d_end); + + current_value_idx = 0; + values_per_mb = block_size / mini_block_count; + + // init the first mini-block + block_start = d_start; + + // only call init if there are actually encoded values + if (value_count > 1) { init_mini_block(false); } + } + + // skip to the start of the next mini-block. should only be called on thread 0. + // calls init_binary_block if currently on the last mini-block in a block. + // is_decode indicates whether this is being called from initialization code (false) or + // the actual decoding (true) + inline __device__ void setup_next_mini_block(bool is_decode) + { + if (current_value_idx >= num_encoded_values(is_decode)) { return; } + + current_value_idx += values_per_mb; + + // just set pointer to start of next mini_block + if (cur_mb < mini_block_count - 1) { + cur_mb_start += cur_bitwidths[cur_mb] * values_per_mb / 8; + cur_mb++; + } + // out of mini-blocks, start a new block + else { + block_start = cur_mb_start + cur_bitwidths[cur_mb] * values_per_mb / 8; + init_mini_block(is_decode); + } + } + + // decode the current mini-batch of deltas, and convert to values. + // called by all threads in a warp, currently only one warp supported. 
+ inline __device__ void calc_mini_block_values(int lane_id) + { + using cudf::detail::warp_size; + if (current_value_idx >= value_count) { return; } + + // need to save first value from header on first pass + if (current_value_idx == 0) { + if (lane_id == 0) { + current_value_idx++; + value[0] = last_value; + } + __syncwarp(); + if (current_value_idx >= value_count) { return; } + } + + uint32_t const mb_bits = cur_bitwidths[cur_mb]; + + // need to do in multiple passes if values_per_mb != 32 + uint32_t const num_pass = values_per_mb / warp_size; + + auto d_start = cur_mb_start; + + for (int i = 0; i < num_pass; i++) { + // position at end of the current mini-block since the following calculates + // negative indexes + d_start += (warp_size * mb_bits) / 8; + + // unpack deltas. modified from version in gpuDecodeDictionaryIndices(), but + // that one only unpacks up to bitwidths of 24. simplified some since this + // will always do batches of 32. + // NOTE: because this needs to handle up to 64 bits, the branching used in the other + // implementation has been replaced with a loop. While this uses more registers, the + // looping version is just as fast and easier to read. Might need to revisit this when + // DELTA_BYTE_ARRAY is implemented. + zigzag128_t delta = 0; + if (lane_id + current_value_idx < value_count) { + int32_t ofs = (lane_id - warp_size) * mb_bits; + uint8_t const* p = d_start + (ofs >> 3); + ofs &= 7; + if (p < block_end) { + uint32_t c = 8 - ofs; // 0 - 7 bits + delta = (*p++) >> ofs; + + while (c < mb_bits && p < block_end) { + delta |= static_cast(*p++) << c; + c += 8; + } + delta &= (static_cast(1) << mb_bits) - 1; + } + } + + // add min delta to get true delta + delta += cur_min_delta; + + // do inclusive scan to get value - first_value at each position + __shared__ cub::WarpScan::TempStorage temp_storage; + cub::WarpScan(temp_storage).InclusiveSum(delta, delta); + + // now add first value from header or last value from previous block to get true value + delta += last_value; + int const value_idx = + rolling_index(current_value_idx + warp_size * i + lane_id); + value[value_idx] = delta; + + // save value from last lane in warp. this will become the 'first value' added to the + // deltas calculated in the next iteration (or invocation). + if (lane_id == warp_size - 1) { last_value = delta; } + __syncwarp(); + } + } + + // decodes and skips values until the block containing the value after `skip` is reached. + // called by all threads in a thread block. + inline __device__ void skip_values(int skip) + { + using cudf::detail::warp_size; + int const t = threadIdx.x; + int const lane_id = t % warp_size; + + while (current_value_idx < skip && current_value_idx < num_encoded_values(true)) { + if (t < warp_size) { + calc_mini_block_values(lane_id); + if (lane_id == 0) { setup_next_mini_block(true); } + } + __syncthreads(); + } + } + + // decodes the current mini block and stores the values obtained. should only be called by + // a single warp. 
+ inline __device__ void decode_batch() + { + using cudf::detail::warp_size; + int const t = threadIdx.x; + int const lane_id = t % warp_size; + + // unpack deltas and save in db->value + calc_mini_block_values(lane_id); + + // set up for next mini-block + if (lane_id == 0) { setup_next_mini_block(true); } + } +}; + +} // namespace cudf::io::parquet::gpu diff --git a/cpp/src/io/parquet/page_data.cu b/cpp/src/io/parquet/page_data.cu index a870d973dc1..e30df2d1f98 100644 --- a/cpp/src/io/parquet/page_data.cu +++ b/cpp/src/io/parquet/page_data.cu @@ -20,6 +20,9 @@ #include +#include +#include + namespace cudf { namespace io { namespace parquet { @@ -27,6 +30,9 @@ namespace gpu { namespace { +constexpr int decode_block_size = 128; +constexpr int rolling_buf_size = decode_block_size * 2; + /** * @brief Output a string descriptor * @@ -35,8 +41,9 @@ namespace { * @param[in] src_pos Source position * @param[in] dstv Pointer to row output data (string descriptor or 32-bit hash) */ +template inline __device__ void gpuOutputString(volatile page_state_s* s, - volatile page_state_buffers_s* sb, + volatile state_buf* sb, int src_pos, void* dstv) { @@ -63,11 +70,10 @@ inline __device__ void gpuOutputString(volatile page_state_s* s, * @param[in] src_pos Source position * @param[in] dst Pointer to row output data */ -inline __device__ void gpuOutputBoolean(volatile page_state_buffers_s* sb, - int src_pos, - uint8_t* dst) +template +inline __device__ void gpuOutputBoolean(volatile state_buf* sb, int src_pos, uint8_t* dst) { - *dst = sb->dict_idx[rolling_index(src_pos)]; + *dst = sb->dict_idx[rolling_index(src_pos)]; } /** @@ -138,8 +144,9 @@ inline __device__ void gpuStoreOutput(uint2* dst, * @param[in] src_pos Source position * @param[out] dst Pointer to row output data */ +template inline __device__ void gpuOutputInt96Timestamp(volatile page_state_s* s, - volatile page_state_buffers_s* sb, + volatile state_buf* sb, int src_pos, int64_t* dst) { @@ -150,8 +157,9 @@ inline __device__ void gpuOutputInt96Timestamp(volatile page_state_s* s, if (s->dict_base) { // Dictionary - dict_pos = (s->dict_bits > 0) ? sb->dict_idx[rolling_index(src_pos)] : 0; - src8 = s->dict_base; + dict_pos = + (s->dict_bits > 0) ? sb->dict_idx[rolling_index(src_pos)] : 0; + src8 = s->dict_base; } else { // Plain dict_pos = src_pos; @@ -211,8 +219,9 @@ inline __device__ void gpuOutputInt96Timestamp(volatile page_state_s* s, * @param[in] src_pos Source position * @param[in] dst Pointer to row output data */ +template inline __device__ void gpuOutputInt64Timestamp(volatile page_state_s* s, - volatile page_state_buffers_s* sb, + volatile state_buf* sb, int src_pos, int64_t* dst) { @@ -222,8 +231,9 @@ inline __device__ void gpuOutputInt64Timestamp(volatile page_state_s* s, if (s->dict_base) { // Dictionary - dict_pos = (s->dict_bits > 0) ? sb->dict_idx[rolling_index(src_pos)] : 0; - src8 = s->dict_base; + dict_pos = + (s->dict_bits > 0) ? sb->dict_idx[rolling_index(src_pos)] : 0; + src8 = s->dict_base; } else { // Plain dict_pos = src_pos; @@ -292,16 +302,18 @@ __device__ void gpuOutputByteArrayAsInt(char const* ptr, int32_t len, T* dst) * @param[in] src_pos Source position * @param[in] dst Pointer to row output data */ -template +template __device__ void gpuOutputFixedLenByteArrayAsInt(volatile page_state_s* s, - volatile page_state_buffers_s* sb, + volatile state_buf* sb, int src_pos, T* dst) { uint32_t const dtype_len_in = s->dtype_len_in; uint8_t const* data = s->dict_base ? 
s->dict_base : s->data_start; uint32_t const pos = - (s->dict_base ? ((s->dict_bits > 0) ? sb->dict_idx[rolling_index(src_pos)] : 0) : src_pos) * + (s->dict_base + ? ((s->dict_bits > 0) ? sb->dict_idx[rolling_index(src_pos)] : 0) + : src_pos) * dtype_len_in; uint32_t const dict_size = s->dict_size; @@ -327,9 +339,9 @@ __device__ void gpuOutputFixedLenByteArrayAsInt(volatile page_state_s* s, * @param[in] src_pos Source position * @param[in] dst Pointer to row output data */ -template +template inline __device__ void gpuOutputFast(volatile page_state_s* s, - volatile page_state_buffers_s* sb, + volatile state_buf* sb, int src_pos, T* dst) { @@ -338,8 +350,9 @@ inline __device__ void gpuOutputFast(volatile page_state_s* s, if (s->dict_base) { // Dictionary - dict_pos = (s->dict_bits > 0) ? sb->dict_idx[rolling_index(src_pos)] : 0; - dict = s->dict_base; + dict_pos = + (s->dict_bits > 0) ? sb->dict_idx[rolling_index(src_pos)] : 0; + dict = s->dict_base; } else { // Plain dict_pos = src_pos; @@ -358,16 +371,18 @@ inline __device__ void gpuOutputFast(volatile page_state_s* s, * @param[in] dst8 Pointer to row output data * @param[in] len Length of element */ +template static __device__ void gpuOutputGeneric( - volatile page_state_s* s, volatile page_state_buffers_s* sb, int src_pos, uint8_t* dst8, int len) + volatile page_state_s* s, volatile state_buf* sb, int src_pos, uint8_t* dst8, int len) { uint8_t const* dict; uint32_t dict_pos, dict_size = s->dict_size; if (s->dict_base) { // Dictionary - dict_pos = (s->dict_bits > 0) ? sb->dict_idx[rolling_index(src_pos)] : 0; - dict = s->dict_base; + dict_pos = + (s->dict_bits > 0) ? sb->dict_idx[rolling_index(src_pos)] : 0; + dict = s->dict_base; } else { // Plain dict_pos = src_pos; @@ -402,341 +417,6 @@ static __device__ void gpuOutputGeneric( } } -/** - * - * This function expects the dictionary position to be at 0 and will traverse - * the entire thing. - * - * Operates on a single warp only. Expects t < 32 - * - * @param s The local page info - * @param t Thread index - */ -__device__ size_type gpuDecodeTotalPageStringSize(page_state_s* s, int t) -{ - size_type target_pos = s->num_input_values; - size_type str_len = 0; - if (s->dict_base) { - auto const [new_target_pos, len] = gpuDecodeDictionaryIndices(s, nullptr, target_pos, t); - target_pos = new_target_pos; - str_len = len; - } else if ((s->col.data_type & 7) == BYTE_ARRAY) { - str_len = gpuInitStringDescriptors(s, nullptr, target_pos, t); - } - if (!t) { *(volatile int32_t*)&s->dict_pos = target_pos; } - return str_len; -} - -/** - * @brief Update output column sizes for every nesting level based on a batch - * of incoming decoded definition and repetition level values. - * - * If bounds_set is true, computes skipped_values and skipped_leaf_values for the - * page to indicate where we need to skip to based on min/max row. - * - * Operates at the block level. 
- * - * @param s The local page info - * @param target_value_count The target value count to process up to - * @param rep Repetition level buffer - * @param def Definition level buffer - * @param t Thread index - * @param bounds_set A boolean indicating whether or not min/max row bounds have been set - */ -template -static __device__ void gpuUpdatePageSizes(page_state_s* s, - int target_value_count, - level_t const* const rep, - level_t const* const def, - int t, - bool bounds_set) -{ - // max nesting depth of the column - int const max_depth = s->col.max_nesting_depth; - - constexpr int num_warps = preprocess_block_size / 32; - constexpr int max_batch_size = num_warps * 32; - - using block_reduce = cub::BlockReduce; - using block_scan = cub::BlockScan; - __shared__ union { - typename block_reduce::TempStorage reduce_storage; - typename block_scan::TempStorage scan_storage; - } temp_storage; - - // how many input level values we've processed in the page so far - int value_count = s->input_value_count; - // how many rows we've processed in the page so far - int row_count = s->input_row_count; - // how many leaf values we've processed in the page so far - int leaf_count = s->input_leaf_count; - // whether or not we need to continue checking for the first row - bool skipped_values_set = s->page.skipped_values >= 0; - - while (value_count < target_value_count) { - int const batch_size = min(max_batch_size, target_value_count - value_count); - - // start/end depth - int start_depth, end_depth, d; - get_nesting_bounds( - start_depth, end_depth, d, s, rep, def, value_count, value_count + batch_size, t); - - // is this thread within row bounds? in the non skip_rows/num_rows case this will always - // be true. - int in_row_bounds = 1; - - // if we are in the skip_rows/num_rows case, we need to check against these limits - if (bounds_set) { - // get absolute thread row index - int const is_new_row = start_depth == 0; - int thread_row_count, block_row_count; - block_scan(temp_storage.scan_storage) - .InclusiveSum(is_new_row, thread_row_count, block_row_count); - __syncthreads(); - - // get absolute thread leaf index - int const is_new_leaf = (d >= s->nesting_info[max_depth - 1].max_def_level); - int thread_leaf_count, block_leaf_count; - block_scan(temp_storage.scan_storage) - .InclusiveSum(is_new_leaf, thread_leaf_count, block_leaf_count); - __syncthreads(); - - // if this thread is in row bounds - int const row_index = (thread_row_count + row_count) - 1; - in_row_bounds = - (row_index >= s->row_index_lower_bound) && (row_index < (s->first_row + s->num_rows)); - - // if we have not set skipped values yet, see if we found the first in-bounds row - if (!skipped_values_set) { - int local_count, global_count; - block_scan(temp_storage.scan_storage) - .InclusiveSum(in_row_bounds, local_count, global_count); - __syncthreads(); - - // we found it - if (global_count > 0) { - // this is the thread that represents the first row. - if (local_count == 1 && in_row_bounds) { - s->page.skipped_values = value_count + t; - s->page.skipped_leaf_values = - leaf_count + (is_new_leaf ? 
thread_leaf_count - 1 : thread_leaf_count); - } - skipped_values_set = true; - } - } - - row_count += block_row_count; - leaf_count += block_leaf_count; - } - - // increment value counts across all nesting depths - for (int s_idx = 0; s_idx < max_depth; s_idx++) { - int const in_nesting_bounds = (s_idx >= start_depth && s_idx <= end_depth && in_row_bounds); - int const count = block_reduce(temp_storage.reduce_storage).Sum(in_nesting_bounds); - __syncthreads(); - if (!t) { - PageNestingInfo* pni = &s->page.nesting[s_idx]; - pni->batch_size += count; - } - } - - value_count += batch_size; - } - - // update final outputs - if (!t) { - s->input_value_count = value_count; - - // only used in the skip_rows/num_rows case - s->input_leaf_count = leaf_count; - s->input_row_count = row_count; - } -} - -/** - * @brief Kernel for computing per-page column size information for all nesting levels. - * - * This function will write out the size field for each level of nesting. - * - * @param pages List of pages - * @param chunks List of column chunks - * @param min_row Row index to start reading at - * @param num_rows Maximum number of rows to read. Pass as INT_MAX to guarantee reading all rows - * @param is_base_pass Whether or not this is the base pass. We first have to compute - * the full size information of every page before we come through in a second (trim) pass - * to determine what subset of rows in this page we should be reading - * @param compute_string_sizes Whether or not we should be computing string sizes - * (PageInfo::str_bytes) as part of the pass - */ -template -__global__ void __launch_bounds__(preprocess_block_size) - gpuComputePageSizes(PageInfo* pages, - device_span chunks, - size_t min_row, - size_t num_rows, - bool is_base_pass, - bool compute_string_sizes) -{ - __shared__ __align__(16) page_state_s state_g; - - page_state_s* const s = &state_g; - int page_idx = blockIdx.x; - int t = threadIdx.x; - PageInfo* pp = &pages[page_idx]; - - // whether or not we have repetition levels (lists) - bool has_repetition = chunks[pp->chunk_idx].max_level[level_type::REPETITION] > 0; - - // the level stream decoders - __shared__ rle_run def_runs[run_buffer_size]; - __shared__ rle_run rep_runs[run_buffer_size]; - rle_stream decoders[level_type::NUM_LEVEL_TYPES] = {{def_runs}, {rep_runs}}; - - // setup page info - if (!setupLocalPageInfo(s, pp, chunks, min_row, num_rows, all_types_filter{}, false)) { return; } - - // initialize the stream decoders (requires values computed in setupLocalPageInfo) - int const max_batch_size = lvl_buf_size; - level_t* rep = reinterpret_cast(pp->lvl_decode_buf[level_type::REPETITION]); - level_t* def = reinterpret_cast(pp->lvl_decode_buf[level_type::DEFINITION]); - decoders[level_type::DEFINITION].init(s->col.level_bits[level_type::DEFINITION], - s->abs_lvl_start[level_type::DEFINITION], - s->abs_lvl_end[level_type::DEFINITION], - max_batch_size, - def, - s->page.num_input_values); - if (has_repetition) { - decoders[level_type::REPETITION].init(s->col.level_bits[level_type::REPETITION], - s->abs_lvl_start[level_type::REPETITION], - s->abs_lvl_end[level_type::REPETITION], - max_batch_size, - rep, - s->page.num_input_values); - } - __syncthreads(); - - if (!t) { - s->page.skipped_values = -1; - s->page.skipped_leaf_values = 0; - s->page.str_bytes = 0; - s->input_row_count = 0; - s->input_value_count = 0; - - // in the base pass, we're computing the number of rows, make sure we visit absolutely - // everything - if (is_base_pass) { - s->first_row = 0; - s->num_rows = 
INT_MAX; - s->row_index_lower_bound = -1; - } - } - - // we only need to preprocess hierarchies with repetition in them (ie, hierarchies - // containing lists anywhere within). - compute_string_sizes = - compute_string_sizes && ((s->col.data_type & 7) == BYTE_ARRAY && s->dtype_len != 4); - - // early out optimizations: - - // - if this is a flat hierarchy (no lists) and is not a string column. in this case we don't need - // to do the expensive work of traversing the level data to determine sizes. we can just compute - // it directly. - if (!has_repetition && !compute_string_sizes) { - int depth = 0; - while (depth < s->page.num_output_nesting_levels) { - auto const thread_depth = depth + t; - if (thread_depth < s->page.num_output_nesting_levels) { - if (is_base_pass) { pp->nesting[thread_depth].size = pp->num_input_values; } - pp->nesting[thread_depth].batch_size = pp->num_input_values; - } - depth += blockDim.x; - } - return; - } - - // in the trim pass, for anything with lists, we only need to fully process bounding pages (those - // at the beginning or the end of the row bounds) - if (!is_base_pass && !is_bounds_page(s, min_row, num_rows, has_repetition)) { - int depth = 0; - while (depth < s->page.num_output_nesting_levels) { - auto const thread_depth = depth + t; - if (thread_depth < s->page.num_output_nesting_levels) { - // if we are not a bounding page (as checked above) then we are either - // returning all rows/values from this page, or 0 of them - pp->nesting[thread_depth].batch_size = - (s->num_rows == 0 && !is_page_contained(s, min_row, num_rows)) - ? 0 - : pp->nesting[thread_depth].size; - } - depth += blockDim.x; - } - return; - } - - // zero sizes - int depth = 0; - while (depth < s->page.num_output_nesting_levels) { - auto const thread_depth = depth + t; - if (thread_depth < s->page.num_output_nesting_levels) { - s->page.nesting[thread_depth].batch_size = 0; - } - depth += blockDim.x; - } - __syncthreads(); - - // the core loop. decode batches of level stream data using rle_stream objects - // and pass the results to gpuUpdatePageSizes - int processed = 0; - while (processed < s->page.num_input_values) { - // TODO: it would not take much more work to make it so that we could run both of these - // decodes concurrently. there are a couple of shared variables internally that would have to - // get dealt with but that's about it. - if (has_repetition) { - decoders[level_type::REPETITION].decode_next(t); - __syncthreads(); - } - // the # of rep/def levels will always be the same size - processed += decoders[level_type::DEFINITION].decode_next(t); - __syncthreads(); - - // update page sizes - gpuUpdatePageSizes(s, processed, rep, def, t, !is_base_pass); - __syncthreads(); - } - - // retrieve total string size. 
- // TODO: make this block-based instead of just 1 warp - if (compute_string_sizes) { - if (t < 32) { s->page.str_bytes = gpuDecodeTotalPageStringSize(s, t); } - } - - // update output results: - // - real number of rows for the whole page - // - nesting sizes for the whole page - // - skipped value information for trimmed pages - // - string bytes - if (is_base_pass) { - // nesting level 0 is the root column, so the size is also the # of rows - if (!t) { pp->num_rows = s->page.nesting[0].batch_size; } - - // store off this batch size as the "full" size - int depth = 0; - while (depth < s->page.num_output_nesting_levels) { - auto const thread_depth = depth + t; - if (thread_depth < s->page.num_output_nesting_levels) { - pp->nesting[thread_depth].size = pp->nesting[thread_depth].batch_size; - } - depth += blockDim.x; - } - } - - if (!t) { - pp->skipped_values = s->page.skipped_values; - pp->skipped_leaf_values = s->page.skipped_leaf_values; - pp->str_bytes = s->page.str_bytes; - } -} - /** * @brief Kernel for computing the column data stored in the pages * @@ -755,17 +435,19 @@ __global__ void __launch_bounds__(decode_block_size) gpuDecodePageData( PageInfo* pages, device_span chunks, size_t min_row, size_t num_rows) { __shared__ __align__(16) page_state_s state_g; - __shared__ __align__(16) page_state_buffers_s state_buffers; + __shared__ __align__(16) + page_state_buffers_s + state_buffers; - page_state_s* const s = &state_g; - page_state_buffers_s* const sb = &state_buffers; - int page_idx = blockIdx.x; - int t = threadIdx.x; + page_state_s* const s = &state_g; + auto* const sb = &state_buffers; + int page_idx = blockIdx.x; + int t = threadIdx.x; int out_thread0; [[maybe_unused]] null_count_back_copier _{s, t}; if (!setupLocalPageInfo( - s, &pages[page_idx], chunks, min_row, num_rows, non_string_filter{chunks}, true)) { + s, &pages[page_idx], chunks, min_row, num_rows, mask_filter{KERNEL_MASK_GENERAL}, true)) { return; } @@ -780,8 +462,8 @@ __global__ void __launch_bounds__(decode_block_size) gpuDecodePageData( PageNestingDecodeInfo* nesting_info_base = s->nesting_info; - __shared__ level_t rep[non_zero_buffer_size]; // circular buffer of repetition level values - __shared__ level_t def[non_zero_buffer_size]; // circular buffer of definition level values + __shared__ level_t rep[rolling_buf_size]; // circular buffer of repetition level values + __shared__ level_t def[rolling_buf_size]; // circular buffer of definition level values // skipped_leaf_values will always be 0 for flat hierarchies. uint32_t skipped_leaf_values = s->page.skipped_leaf_values; @@ -822,7 +504,7 @@ __global__ void __launch_bounds__(decode_block_size) gpuDecodePageData( src_pos += t - out_thread0; // the position in the output column/buffer - int dst_pos = sb->nz_idx[rolling_index(src_pos)]; + int dst_pos = sb->nz_idx[rolling_index(src_pos)]; // for the flat hierarchy case we will be reading from the beginning of the value stream, // regardless of the value of first_row. so adjust our destination offset accordingly. 
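To make the DELTA_BINARY_PACKED layout described at the top of delta_binary.cuh easier to follow, here is a minimal host-side C++ sketch of the same format: a ULEB128 header (block size, mini-block count, value count, zig-zag first value) followed by blocks that each carry a zig-zag min delta, a per-mini-block bit-width array, and bit-packed deltas. The warp-synchronous decoder in the patch reconstructs values with a cub inclusive scan; this sketch uses a plain running sum instead. Names such as decode_delta_binary and read_bits are illustrative only, the code assumes a well-formed buffer, and it is not the patch's GPU implementation.

#include <cstddef>
#include <cstdint>
#include <vector>

using uleb128_t   = uint64_t;
using zigzag128_t = int64_t;

// Read an unsigned LEB128 varint, advancing `cur` (same idea as get_uleb128 above).
uleb128_t read_uleb128(uint8_t const*& cur, uint8_t const* end)
{
  uleb128_t v = 0;
  int shift   = 0;
  while (cur < end) {
    uint8_t const c = *cur++;
    v |= static_cast<uleb128_t>(c & 0x7f) << shift;
    shift += 7;
    if ((c & 0x80) == 0) { break; }
  }
  return v;
}

// Read a zig-zag encoded signed LEB128 varint (same idea as get_zz128 above).
zigzag128_t read_zigzag(uint8_t const*& cur, uint8_t const* end)
{
  uleb128_t const u = read_uleb128(cur, end);
  return static_cast<zigzag128_t>(u >> 1) ^ -static_cast<zigzag128_t>(u & 1);
}

// Read `nbits` bits starting at bit offset `bit_pos` of `data` (LSB-first bit packing).
uint64_t read_bits(uint8_t const* data, size_t bit_pos, int nbits)
{
  uint64_t v = 0;
  for (int i = 0; i < nbits; ++i) {
    uint64_t const bit = (data[(bit_pos + i) / 8] >> ((bit_pos + i) % 8)) & 1u;
    v |= bit << i;
  }
  return v;
}

// Decode a complete DELTA_BINARY_PACKED buffer into 64-bit values.
std::vector<int64_t> decode_delta_binary(uint8_t const* cur, uint8_t const* end)
{
  auto const block_size       = read_uleb128(cur, end);  // multiple of 128
  auto const mini_block_count = read_uleb128(cur, end);  // usually 4
  auto const value_count      = read_uleb128(cur, end);  // includes the first value
  zigzag128_t last_value      = read_zigzag(cur, end);   // first value, stored in the header

  auto const values_per_mb = block_size / mini_block_count;  // multiple of 32

  std::vector<int64_t> out;
  if (value_count == 0) { return out; }
  out.push_back(last_value);

  while (out.size() < value_count && cur < end) {
    // per-block header: zig-zag min delta, then one bit width per mini-block
    zigzag128_t const min_delta    = read_zigzag(cur, end);
    uint8_t const* const bitwidths = cur;
    cur += mini_block_count;

    for (uleb128_t mb = 0; mb < mini_block_count && out.size() < value_count; ++mb) {
      int const w = bitwidths[mb];
      for (uleb128_t i = 0; i < values_per_mb && out.size() < value_count; ++i) {
        // value = previous value + min delta + bit-packed delta
        last_value += min_delta + static_cast<zigzag128_t>(read_bits(cur, i * w, w));
        out.push_back(last_value);
      }
      cur += (values_per_mb * w) / 8;  // advance past this mini-block's payload
    }
  }
  return out;
}

Each mini-block payload is values_per_mb * bit_width bits, which is always a whole number of bytes because values_per_mb is a multiple of 32; that is what lets both the sketch and the device decoder advance their read pointers with simple byte arithmetic.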
@@ -908,36 +590,19 @@ __global__ void __launch_bounds__(decode_block_size) gpuDecodePageData( } } +struct mask_tform { + __device__ uint32_t operator()(PageInfo const& p) { return p.kernel_mask; } +}; + } // anonymous namespace -/** - * @copydoc cudf::io::parquet::gpu::ComputePageSizes - */ -void ComputePageSizes(cudf::detail::hostdevice_vector& pages, - cudf::detail::hostdevice_vector const& chunks, - size_t min_row, - size_t num_rows, - bool compute_num_rows, - bool compute_string_sizes, - int level_type_size, - rmm::cuda_stream_view stream) +uint32_t GetAggregatedDecodeKernelMask(cudf::detail::hostdevice_vector& pages, + rmm::cuda_stream_view stream) { - dim3 dim_block(preprocess_block_size, 1); - dim3 dim_grid(pages.size(), 1); // 1 threadblock per page - - // computes: - // PageNestingInfo::size for each level of nesting, for each page. - // This computes the size for the entire page, not taking row bounds into account. - // If uses_custom_row_bounds is set to true, we have to do a second pass later that "trims" - // the starting and ending read values to account for these bounds. - if (level_type_size == 1) { - gpuComputePageSizes<<>>( - pages.device_ptr(), chunks, min_row, num_rows, compute_num_rows, compute_string_sizes); - } else { - gpuComputePageSizes - <<>>( - pages.device_ptr(), chunks, min_row, num_rows, compute_num_rows, compute_string_sizes); - } + // determine which kernels to invoke + auto mask_iter = thrust::make_transform_iterator(pages.d_begin(), mask_tform{}); + return thrust::reduce( + rmm::exec_policy(stream), mask_iter, mask_iter + pages.size(), 0U, thrust::bit_or{}); } /** @@ -956,10 +621,10 @@ void __host__ DecodePageData(cudf::detail::hostdevice_vector& pages, dim3 dim_grid(pages.size(), 1); // 1 threadblock per page if (level_type_size == 1) { - gpuDecodePageData + gpuDecodePageData <<>>(pages.device_ptr(), chunks, min_row, num_rows); } else { - gpuDecodePageData + gpuDecodePageData <<>>(pages.device_ptr(), chunks, min_row, num_rows); } } diff --git a/cpp/src/io/parquet/page_decode.cuh b/cpp/src/io/parquet/page_decode.cuh index f649eb97680..e172382e23a 100644 --- a/cpp/src/io/parquet/page_decode.cuh +++ b/cpp/src/io/parquet/page_decode.cuh @@ -25,20 +25,6 @@ namespace cudf::io::parquet::gpu { -constexpr int preprocess_block_size = num_rle_stream_decode_threads; // 512 -constexpr int decode_block_size = 128; -constexpr int non_zero_buffer_size = decode_block_size * 2; - -constexpr int rolling_index(cudf::thread_index_type index) -{ - return index & (non_zero_buffer_size - 1); -} -template -constexpr int rolling_lvl_index(cudf::thread_index_type index) -{ - return index % lvl_buf_size; -} - struct page_state_s { uint8_t const* data_start; uint8_t const* data_end; @@ -86,10 +72,15 @@ struct page_state_s { // buffers only used in the decode kernel. separated from page_state_s to keep // shared memory usage in other kernels (eg, gpuComputePageSizes) down. 
+template struct page_state_buffers_s { - uint32_t nz_idx[non_zero_buffer_size]; // circular buffer of non-null value positions - uint32_t dict_idx[non_zero_buffer_size]; // Dictionary index, boolean, or string offset values - uint32_t str_len[non_zero_buffer_size]; // String length for plain encoding of strings + static constexpr int nz_buf_size = _nz_buf_size; + static constexpr int dict_buf_size = _dict_buf_size; + static constexpr int str_buf_size = _str_buf_size; + + uint32_t nz_idx[nz_buf_size]; // circular buffer of non-null value positions + uint32_t dict_idx[dict_buf_size]; // Dictionary index, boolean, or string offset values + uint32_t str_len[str_buf_size]; // String length for plain encoding of strings }; // Copies null counts back to `nesting_decode` at the end of scope @@ -175,11 +166,14 @@ inline __device__ bool is_page_contained(page_state_s* const s, size_t start_row * @param[in] s Page state input * @param[out] sb Page state buffer output * @param[in] src_pos Source position + * @tparam state_buf Typename of the `state_buf` (usually inferred) * * @return A pair containing a pointer to the string and its length */ -inline __device__ cuda::std::pair gpuGetStringData( - page_state_s volatile* s, page_state_buffers_s volatile* sb, int src_pos) +template +inline __device__ cuda::std::pair gpuGetStringData(page_state_s volatile* s, + state_buf volatile* sb, + int src_pos) { char const* ptr = nullptr; size_t len = 0; @@ -187,7 +181,9 @@ inline __device__ cuda::std::pair gpuGetStringData( if (s->dict_base) { // String dictionary uint32_t dict_pos = - (s->dict_bits > 0) ? sb->dict_idx[rolling_index(src_pos)] * sizeof(string_index_pair) : 0; + (s->dict_bits > 0) + ? sb->dict_idx[rolling_index(src_pos)] * sizeof(string_index_pair) + : 0; if (dict_pos < (uint32_t)s->dict_size) { auto const* src = reinterpret_cast(s->dict_base + dict_pos); ptr = src->first; @@ -195,10 +191,10 @@ inline __device__ cuda::std::pair gpuGetStringData( } } else { // Plain encoding - uint32_t dict_pos = sb->dict_idx[rolling_index(src_pos)]; + uint32_t dict_pos = sb->dict_idx[rolling_index(src_pos)]; if (dict_pos <= (uint32_t)s->dict_size) { ptr = reinterpret_cast(s->data_start + dict_pos); - len = sb->str_len[rolling_index(src_pos)]; + len = sb->str_len[rolling_index(src_pos)]; } } @@ -213,18 +209,17 @@ inline __device__ cuda::std::pair gpuGetStringData( * @param[in] target_pos Target index position in dict_idx buffer (may exceed this value by up to * 31) * @param[in] t Warp1 thread ID (0..31) + * @tparam sizes_only True if only sizes are to be calculated + * @tparam state_buf Typename of the `state_buf` (usually inferred) * * @return A pair containing the new output position, and the total length of strings decoded (this * will only be valid on thread 0 and if sizes_only is true). In the event that this function * decodes strings beyond target_pos, the total length of strings returned will include these * additional values. 
*/ -template +template __device__ cuda::std::pair gpuDecodeDictionaryIndices( - page_state_s volatile* s, - [[maybe_unused]] page_state_buffers_s volatile* sb, - int target_pos, - int t) + page_state_s volatile* s, [[maybe_unused]] state_buf volatile* sb, int target_pos, int t) { uint8_t const* end = s->data_end; int dict_bits = s->dict_bits; @@ -300,7 +295,9 @@ __device__ cuda::std::pair gpuDecodeDictionaryIndices( } // if we're not computing sizes, store off the dictionary index - if constexpr (!sizes_only) { sb->dict_idx[rolling_index(pos + t)] = dict_idx; } + if constexpr (!sizes_only) { + sb->dict_idx[rolling_index(pos + t)] = dict_idx; + } } // if we're computing sizes, add the length(s) @@ -333,11 +330,13 @@ __device__ cuda::std::pair gpuDecodeDictionaryIndices( * @param[out] sb Page state buffer output * @param[in] target_pos Target write position * @param[in] t Thread ID + * @tparam state_buf Typename of the `state_buf` (usually inferred) * * @return The new output position */ +template inline __device__ int gpuDecodeRleBooleans(page_state_s volatile* s, - page_state_buffers_s volatile* sb, + state_buf volatile* sb, int target_pos, int t) { @@ -386,7 +385,7 @@ inline __device__ int gpuDecodeRleBooleans(page_state_s volatile* s, } else { dict_idx = s->dict_val; } - sb->dict_idx[rolling_index(pos + t)] = dict_idx; + sb->dict_idx[rolling_index(pos + t)] = dict_idx; } pos += batch_len; } @@ -401,12 +400,14 @@ inline __device__ int gpuDecodeRleBooleans(page_state_s volatile* s, * @param[out] sb Page state buffer output * @param[in] target_pos Target output position * @param[in] t Thread ID + * @tparam sizes_only True if only sizes are to be calculated + * @tparam state_buf Typename of the `state_buf` (usually inferred) * * @return Total length of strings processed */ -template +template __device__ size_type gpuInitStringDescriptors(page_state_s volatile* s, - [[maybe_unused]] page_state_buffers_s volatile* sb, + [[maybe_unused]] state_buf volatile* sb, int target_pos, int t) { @@ -429,8 +430,8 @@ __device__ size_type gpuInitStringDescriptors(page_state_s volatile* s, len = 0; } if constexpr (!sizes_only) { - sb->dict_idx[rolling_index(pos)] = k; - sb->str_len[rolling_index(pos)] = len; + sb->dict_idx[rolling_index(pos)] = k; + sb->str_len[rolling_index(pos)] = len; } k += len; total_len += len; @@ -451,8 +452,10 @@ __device__ size_type gpuInitStringDescriptors(page_state_s volatile* s, * @param[in] target_count Target count of stream values on output * @param[in] t Warp0 thread ID (0..31) * @param[in] lvl The level type we are decoding - DEFINITION or REPETITION + * @tparam level_t Type used to store decoded repetition and definition levels + * @tparam rolling_buf_size Size of the cyclic buffer used to store value data */ -template +template __device__ void gpuDecodeStream( level_t* output, page_state_s* s, int32_t target_count, int t, level_type lvl) { @@ -519,8 +522,8 @@ __device__ void gpuDecodeStream( level_run -= batch_len * 2; } if (t < batch_len) { - int idx = value_count + t; - output[rolling_index(idx)] = level_val; + int idx = value_count + t; + output[rolling_index(idx)] = level_val; } batch_coded_count += batch_len; value_count += batch_len; @@ -541,24 +544,26 @@ __device__ void gpuDecodeStream( * * @param[in,out] nesting_info The page/nesting information to store the mask in. 
The validity map * offset is also updated + * @param[in,out] valid_map Pointer to bitmask to store validity information to * @param[in] valid_mask The validity mask to be stored * @param[in] value_count # of bits in the validity mask */ -inline __device__ void store_validity(PageNestingDecodeInfo* nesting_info, +inline __device__ void store_validity(int valid_map_offset, + bitmask_type* valid_map, uint32_t valid_mask, int32_t value_count) { - int word_offset = nesting_info->valid_map_offset / 32; - int bit_offset = nesting_info->valid_map_offset % 32; + int word_offset = valid_map_offset / 32; + int bit_offset = valid_map_offset % 32; // if we fit entirely in the output word if (bit_offset + value_count <= 32) { auto relevant_mask = static_cast((static_cast(1) << value_count) - 1); if (relevant_mask == ~0) { - nesting_info->valid_map[word_offset] = valid_mask; + valid_map[word_offset] = valid_mask; } else { - atomicAnd(nesting_info->valid_map + word_offset, ~(relevant_mask << bit_offset)); - atomicOr(nesting_info->valid_map + word_offset, (valid_mask & relevant_mask) << bit_offset); + atomicAnd(valid_map + word_offset, ~(relevant_mask << bit_offset)); + atomicOr(valid_map + word_offset, (valid_mask & relevant_mask) << bit_offset); } } // we're going to spill over into the next word. @@ -572,17 +577,15 @@ inline __device__ void store_validity(PageNestingDecodeInfo* nesting_info, // first word. strip bits_left bits off the beginning and store that uint32_t relevant_mask = ((1 << bits_left) - 1); uint32_t mask_word0 = valid_mask & relevant_mask; - atomicAnd(nesting_info->valid_map + word_offset, ~(relevant_mask << bit_offset)); - atomicOr(nesting_info->valid_map + word_offset, mask_word0 << bit_offset); + atomicAnd(valid_map + word_offset, ~(relevant_mask << bit_offset)); + atomicOr(valid_map + word_offset, mask_word0 << bit_offset); // second word. 
strip the remainder of the bits off the end and store that relevant_mask = ((1 << (value_count - bits_left)) - 1); uint32_t mask_word1 = valid_mask & (relevant_mask << bits_left); - atomicAnd(nesting_info->valid_map + word_offset + 1, ~(relevant_mask)); - atomicOr(nesting_info->valid_map + word_offset + 1, mask_word1 >> bits_left); + atomicAnd(valid_map + word_offset + 1, ~(relevant_mask)); + atomicOr(valid_map + word_offset + 1, mask_word1 >> bits_left); } - - nesting_info->valid_map_offset += value_count; } /** @@ -599,8 +602,10 @@ inline __device__ void store_validity(PageNestingDecodeInfo* nesting_info, * @param[in] input_value_count The current count of input level values we have processed * @param[in] target_input_value_count The desired # of input level values we want to process * @param[in] t Thread index + * @tparam rolling_buf_size Size of the cyclic buffer used to store value data + * @tparam level_t Type used to store decoded repetition and definition levels */ -template +template inline __device__ void get_nesting_bounds(int& start_depth, int& end_depth, int& d, @@ -615,7 +620,7 @@ inline __device__ void get_nesting_bounds(int& start_depth, end_depth = -1; d = -1; if (input_value_count + t < target_input_value_count) { - int const index = rolling_lvl_index(input_value_count + t); + int const index = rolling_index(input_value_count + t); d = static_cast(def[index]); // if we have repetition (there are list columns involved) we have to // bound what nesting levels we apply values to @@ -643,11 +648,14 @@ inline __device__ void get_nesting_bounds(int& start_depth, * @param[in] rep Repetition level buffer * @param[in] def Definition level buffer * @param[in] t Thread index + * @tparam level_t Type used to store decoded repetition and definition levels + * @tparam state_buf Typename of the `state_buf` (usually inferred) + * @tparam rolling_buf_size Size of the cyclic buffer used to store value data */ -template +template __device__ void gpuUpdateValidityOffsetsAndRowIndices(int32_t target_input_value_count, page_state_s* s, - page_state_buffers_s* sb, + state_buf* sb, level_t const* const rep, level_t const* const def, int t) @@ -667,7 +675,7 @@ __device__ void gpuUpdateValidityOffsetsAndRowIndices(int32_t target_input_value // determine the nesting bounds for this thread (the range of nesting depths we // will generate new value indices and validity bits for) int start_depth, end_depth, d; - get_nesting_bounds( + get_nesting_bounds( start_depth, end_depth, d, s, rep, def, input_value_count, target_input_value_count, t); // 4 interesting things to track: @@ -734,7 +742,7 @@ __device__ void gpuUpdateValidityOffsetsAndRowIndices(int32_t target_input_value int const src_pos = nesting_info->valid_count + thread_valid_count; int const dst_pos = nesting_info->value_count + thread_value_count; // nz_idx is a mapping of src buffer indices to destination buffer indices - sb->nz_idx[rolling_index(src_pos)] = dst_pos; + sb->nz_idx[rolling_index(src_pos)] = dst_pos; } // compute warp and thread value counts for the -next- nesting level. 
we need to @@ -779,8 +787,11 @@ __device__ void gpuUpdateValidityOffsetsAndRowIndices(int32_t target_input_value if (!t) { if (nesting_info->valid_map != nullptr && warp_valid_mask_bit_count > 0) { uint32_t const warp_output_valid_mask = warp_valid_mask >> first_thread_in_write_range; - store_validity(nesting_info, warp_output_valid_mask, warp_valid_mask_bit_count); - + store_validity(nesting_info->valid_map_offset, + nesting_info->valid_map, + warp_output_valid_mask, + warp_valid_mask_bit_count); + nesting_info->valid_map_offset += warp_valid_mask_bit_count; nesting_info->null_count += warp_valid_mask_bit_count - __popc(warp_output_valid_mask); } nesting_info->valid_count += warp_valid_count; @@ -822,10 +833,13 @@ __device__ void gpuUpdateValidityOffsetsAndRowIndices(int32_t target_input_value * @param[in] rep Repetition level buffer * @param[in] def Definition level buffer * @param[in] t Thread index + * @tparam rolling_buf_size Size of the cyclic buffer used to store value data + * @tparam level_t Type used to store decoded repetition and definition levels + * @tparam state_buf Typename of the `state_buf` (usually inferred) */ -template +template __device__ void gpuDecodeLevels(page_state_s* s, - page_state_buffers_s* sb, + state_buf* sb, int32_t target_leaf_count, level_t* const rep, level_t* const def, @@ -837,8 +851,10 @@ __device__ void gpuDecodeLevels(page_state_s* s, int cur_leaf_count = target_leaf_count; while (!s->error && s->nz_count < target_leaf_count && s->input_value_count < s->num_input_values) { - if (has_repetition) { gpuDecodeStream(rep, s, cur_leaf_count, t, level_type::REPETITION); } - gpuDecodeStream(def, s, cur_leaf_count, t, level_type::DEFINITION); + if (has_repetition) { + gpuDecodeStream(rep, s, cur_leaf_count, t, level_type::REPETITION); + } + gpuDecodeStream(def, s, cur_leaf_count, t, level_type::DEFINITION); __syncwarp(); // because the rep and def streams are encoded separately, we cannot request an exact @@ -849,7 +865,7 @@ __device__ void gpuDecodeLevels(page_state_s* s, : s->lvl_count[level_type::DEFINITION]; // process what we got back - gpuUpdateValidityOffsetsAndRowIndices( + gpuUpdateValidityOffsetsAndRowIndices( actual_leaf_count, s, sb, rep, def, t); cur_leaf_count = actual_leaf_count + batch_size; __syncwarp(); @@ -863,9 +879,7 @@ __device__ void gpuDecodeLevels(page_state_s* s, * @param[in,out] s The page state * @param[in] cur The current data position * @param[in] end The end of the data - * @param[in] level_bits The bits required - * @param[in] is_decode_step True if we are performing the decode step. - * @param[in,out] decoders The repetition and definition level stream decoders + * @param[in] lvl Enum indicating whether this is to initialize repetition or definition level data * * @return The length of the section */ @@ -951,21 +965,11 @@ struct all_types_filter { }; /** - * @brief Functor for setupLocalPageInfo that returns true if this is not a string column. + * @brief Functor for setupLocalPageInfo that takes a mask of allowed types. */ -struct non_string_filter { - device_span chunks; - - __device__ inline bool operator()(PageInfo const& page) { return !is_string_col(page, chunks); } -}; - -/** - * @brief Functor for setupLocalPageInfo that returns true if this is a string column. 
- */ -struct string_filter { - device_span chunks; - - __device__ inline bool operator()(PageInfo const& page) { return is_string_col(page, chunks); } +struct mask_filter { + int mask; + __device__ inline bool operator()(PageInfo const& page) { return (page.kernel_mask & mask) != 0; } }; /** @@ -978,9 +982,9 @@ struct string_filter { * @param[in] num_rows Maximum number of rows to read * @param[in] filter Filtering function used to decide which pages to operate on * @param[in] is_decode_step If we are setting up for the decode step (instead of the preprocess) - * @param[in] decoders rle_stream decoders which will be used for decoding levels. Optional. * @tparam Filter Function that takes a PageInfo reference and returns true if the given page should * be operated on Currently only used by gpuComputePageSizes step) + * @return True if this page should be processed further */ template inline __device__ bool setupLocalPageInfo(page_state_s* const s, @@ -1271,6 +1275,9 @@ inline __device__ bool setupLocalPageInfo(page_state_s* const s, if (cur + len > end) { s->error = 2; } s->dict_run = 0; } break; + case Encoding::DELTA_BINARY_PACKED: + // nothing to do, just don't error + break; default: s->error = 1; // Unsupported encoding break; diff --git a/cpp/src/io/parquet/page_delta_decode.cu b/cpp/src/io/parquet/page_delta_decode.cu new file mode 100644 index 00000000000..e79a479388f --- /dev/null +++ b/cpp/src/io/parquet/page_delta_decode.cu @@ -0,0 +1,176 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "delta_binary.cuh" +#include "page_string_utils.cuh" +#include "parquet_gpu.hpp" + +#include + +#include +#include + +namespace cudf::io::parquet::gpu { + +namespace { + +// Decode page data that is DELTA_BINARY_PACKED encoded. This encoding is +// only used for int32 and int64 physical types (and appears to only be used +// with V2 page headers; see https://www.mail-archive.com/dev@parquet.apache.org/msg11826.html). +// this kernel only needs 96 threads (3 warps)(for now). +template +__global__ void __launch_bounds__(96) gpuDecodeDeltaBinary( + PageInfo* pages, device_span chunks, size_t min_row, size_t num_rows) +{ + using cudf::detail::warp_size; + __shared__ __align__(16) delta_binary_decoder db_state; + __shared__ __align__(16) page_state_s state_g; + __shared__ __align__(16) page_state_buffers_s state_buffers; + + page_state_s* const s = &state_g; + auto* const sb = &state_buffers; + int const page_idx = blockIdx.x; + int const t = threadIdx.x; + int const lane_id = t % warp_size; + auto* const db = &db_state; + [[maybe_unused]] null_count_back_copier _{s, t}; + + if (!setupLocalPageInfo(s, + &pages[page_idx], + chunks, + min_row, + num_rows, + mask_filter{KERNEL_MASK_DELTA_BINARY}, + true)) { + return; + } + + bool const has_repetition = s->col.max_level[level_type::REPETITION] > 0; + + // copying logic from gpuDecodePageData. 
+ PageNestingDecodeInfo const* nesting_info_base = s->nesting_info; + + __shared__ level_t rep[delta_rolling_buf_size]; // circular buffer of repetition level values + __shared__ level_t def[delta_rolling_buf_size]; // circular buffer of definition level values + + // skipped_leaf_values will always be 0 for flat hierarchies. + uint32_t const skipped_leaf_values = s->page.skipped_leaf_values; + + // initialize delta state + if (t == 0) { db->init_binary_block(s->data_start, s->data_end); } + __syncthreads(); + + auto const batch_size = db->values_per_mb; + + // if skipped_leaf_values is non-zero, then we need to decode up to the first mini-block + // that has a value we need. + if (skipped_leaf_values > 0) { db->skip_values(skipped_leaf_values); } + + while (!s->error && (s->input_value_count < s->num_input_values || s->src_pos < s->nz_count)) { + uint32_t target_pos; + uint32_t const src_pos = s->src_pos; + + if (t < 2 * warp_size) { // warp0..1 + target_pos = min(src_pos + 2 * batch_size, s->nz_count + batch_size); + } else { // warp2 + target_pos = min(s->nz_count, src_pos + batch_size); + } + __syncthreads(); + + // warp0 will decode the rep/def levels, warp1 will unpack a mini-batch of deltas. + // warp2 waits one cycle for warps 0/1 to produce a batch, and then stuffs values + // into the proper location in the output. + if (t < warp_size) { + // warp 0 + // decode repetition and definition levels. + // - update validity vectors + // - updates offsets (for nested columns) + // - produces non-NULL value indices in s->nz_idx for subsequent decoding + gpuDecodeLevels(s, sb, target_pos, rep, def, t); + } else if (t < 2 * warp_size) { + // warp 1 + db->decode_batch(); + + } else if (src_pos < target_pos) { + // warp 2 + // nesting level that is storing actual leaf values + int const leaf_level_index = s->col.max_nesting_depth - 1; + + // process the mini-block in batches of 32 + for (uint32_t sp = src_pos + lane_id; sp < src_pos + batch_size; sp += 32) { + // the position in the output column/buffer + int32_t dst_pos = sb->nz_idx[rolling_index(sp)]; + + // handle skip_rows here. flat hierarchies can just skip up to first_row. 
+ if (!has_repetition) { dst_pos -= s->first_row; } + + // place value for this thread + if (dst_pos >= 0 && sp < target_pos) { + void* const dst = nesting_info_base[leaf_level_index].data_out + dst_pos * s->dtype_len; + switch (s->dtype_len) { + case 1: + *static_cast(dst) = + db->value[rolling_index(sp + skipped_leaf_values)]; + break; + case 2: + *static_cast(dst) = + db->value[rolling_index(sp + skipped_leaf_values)]; + break; + case 4: + *static_cast(dst) = + db->value[rolling_index(sp + skipped_leaf_values)]; + break; + case 8: + *static_cast(dst) = + db->value[rolling_index(sp + skipped_leaf_values)]; + break; + } + } + } + + if (lane_id == 0) { s->src_pos = src_pos + batch_size; } + } + __syncthreads(); + } +} + +} // anonymous namespace + +/** + * @copydoc cudf::io::parquet::gpu::DecodeDeltaBinary + */ +void __host__ DecodeDeltaBinary(cudf::detail::hostdevice_vector& pages, + cudf::detail::hostdevice_vector const& chunks, + size_t num_rows, + size_t min_row, + int level_type_size, + rmm::cuda_stream_view stream) +{ + CUDF_EXPECTS(pages.size() > 0, "There is no page to decode"); + + dim3 dim_block(96, 1); + dim3 dim_grid(pages.size(), 1); // 1 threadblock per page + + if (level_type_size == 1) { + gpuDecodeDeltaBinary + <<>>(pages.device_ptr(), chunks, min_row, num_rows); + } else { + gpuDecodeDeltaBinary + <<>>(pages.device_ptr(), chunks, min_row, num_rows); + } +} + +} // namespace cudf::io::parquet::gpu diff --git a/cpp/src/io/parquet/page_hdr.cu b/cpp/src/io/parquet/page_hdr.cu index 1fc1b8faddc..0d611643b46 100644 --- a/cpp/src/io/parquet/page_hdr.cu +++ b/cpp/src/io/parquet/page_hdr.cu @@ -154,6 +154,28 @@ __device__ void skip_struct_field(byte_stream_s* bs, int field_type) } while (rep_cnt || struct_depth); } +/** + * @brief Determine which decode kernel to run for the given page. + * + * @param page The page to decode + * @param chunk Column chunk the page belongs to + * @return `kernel_mask_bits` value for the given page + */ +__device__ uint32_t kernel_mask_for_page(gpu::PageInfo const& page, + gpu::ColumnChunkDesc const& chunk) +{ + if (page.flags & PAGEINFO_FLAGS_DICTIONARY) { return 0; } + + if (page.encoding == Encoding::DELTA_BINARY_PACKED) { + return KERNEL_MASK_DELTA_BINARY; + } else if (is_string_col(chunk)) { + return KERNEL_MASK_STRING; + } + + // non-string, non-delta + return KERNEL_MASK_GENERAL; +} + /** * @brief Functor to set value to 32 bit integer read from byte stream * @@ -370,6 +392,7 @@ __global__ void __launch_bounds__(128) bs->page.skipped_values = -1; bs->page.skipped_leaf_values = 0; bs->page.str_bytes = 0; + bs->page.kernel_mask = 0; } num_values = bs->ck.num_values; page_info = bs->ck.page_info; @@ -420,6 +443,7 @@ __global__ void __launch_bounds__(128) } bs->page.page_data = const_cast(bs->cur); bs->cur += bs->page.compressed_page_size; + bs->page.kernel_mask = kernel_mask_for_page(bs->page, bs->ck); } else { bs->cur = bs->end; } diff --git a/cpp/src/io/parquet/page_string_decode.cu b/cpp/src/io/parquet/page_string_decode.cu index bcab14f76c5..331cc72f119 100644 --- a/cpp/src/io/parquet/page_string_decode.cu +++ b/cpp/src/io/parquet/page_string_decode.cu @@ -15,6 +15,7 @@ */ #include "page_decode.cuh" +#include "page_string_utils.cuh" #include #include @@ -26,92 +27,10 @@ namespace gpu { namespace { -// stole this from cudf/strings/detail/gather.cuh. modified to run on a single string on one warp. -// copies from src to dst in 16B chunks per thread. 
-__device__ void wideStrcpy(uint8_t* dst, uint8_t const* src, size_t len, uint32_t lane_id) -{ - using cudf::detail::warp_size; - using cudf::strings::detail::load_uint4; - - constexpr size_t out_datatype_size = sizeof(uint4); - constexpr size_t in_datatype_size = sizeof(uint); - - auto const alignment_offset = reinterpret_cast(dst) % out_datatype_size; - uint4* out_chars_aligned = reinterpret_cast(dst - alignment_offset); - auto const in_start = src; - - // Both `out_start_aligned` and `out_end_aligned` are indices into `dst`. - // `out_start_aligned` is the first 16B aligned memory location after `dst + 4`. - // `out_end_aligned` is the last 16B aligned memory location before `len - 4`. Characters - // between `[out_start_aligned, out_end_aligned)` will be copied using uint4. - // `dst + 4` and `len - 4` are used instead of `dst` and `len` to avoid - // `load_uint4` reading beyond string boundaries. - // use signed int since out_end_aligned can be negative. - int64_t out_start_aligned = (in_datatype_size + alignment_offset + out_datatype_size - 1) / - out_datatype_size * out_datatype_size - - alignment_offset; - int64_t out_end_aligned = - (len - in_datatype_size + alignment_offset) / out_datatype_size * out_datatype_size - - alignment_offset; - - for (int64_t ichar = out_start_aligned + lane_id * out_datatype_size; ichar < out_end_aligned; - ichar += warp_size * out_datatype_size) { - *(out_chars_aligned + (ichar + alignment_offset) / out_datatype_size) = - load_uint4((const char*)in_start + ichar); - } - - // Tail logic: copy characters of the current string outside - // `[out_start_aligned, out_end_aligned)`. - if (out_end_aligned <= out_start_aligned) { - // In this case, `[out_start_aligned, out_end_aligned)` is an empty set, and we copy the - // entire string. - for (int64_t ichar = lane_id; ichar < len; ichar += warp_size) { - dst[ichar] = in_start[ichar]; - } - } else { - // Copy characters in range `[0, out_start_aligned)`. - if (lane_id < out_start_aligned) { dst[lane_id] = in_start[lane_id]; } - // Copy characters in range `[out_end_aligned, len)`. - int64_t ichar = out_end_aligned + lane_id; - if (ichar < len) { dst[ichar] = in_start[ichar]; } - } -} - -/** - * @brief char-parallel string copy. - */ -__device__ void ll_strcpy(uint8_t* dst, uint8_t const* src, size_t len, uint32_t lane_id) -{ - using cudf::detail::warp_size; - if (len > 64) { - wideStrcpy(dst, src, len, lane_id); - } else { - for (int i = lane_id; i < len; i += warp_size) { - dst[i] = src[i]; - } - } -} - -/** - * @brief Perform exclusive scan on an array of any length using a single block of threads. - */ -template -__device__ void block_excl_sum(size_type* arr, size_type length, size_type initial_value) -{ - using block_scan = cub::BlockScan; - __shared__ typename block_scan::TempStorage scan_storage; - auto const t = threadIdx.x; - - // do a series of block sums, storing results in arr as we go - for (thread_index_type pos = 0; pos < length; pos += block_size) { - auto const tidx = pos + t; - size_type tval = tidx < length ? 
arr[tidx] : 0; - size_type block_sum; - block_scan(scan_storage).ExclusiveScan(tval, tval, initial_value, cub::Sum(), block_sum); - if (tidx < length) { arr[tidx] = tval; } - initial_value += block_sum; - } -} +constexpr int preprocess_block_size = 512; +constexpr int decode_block_size = 128; +constexpr int rolling_buf_size = decode_block_size * 2; +constexpr int preproc_buf_size = LEVEL_DECODE_BUF_SIZE; /** * @brief Compute the start and end page value bounds for this page @@ -126,16 +45,16 @@ __device__ void block_excl_sum(size_type* arr, size_type length, size_type initi * @param has_repetition True if the schema is nested * @param decoders Definition and repetition level decoders * @return pair containing start and end value indexes - * @tparam lvl_buf_size Size of the buffer used when decoding repetition and definition levels * @tparam level_t Type used to store decoded repetition and definition levels + * @tparam rle_buf_size Size of the buffer used when decoding repetition and definition levels */ -template +template __device__ thrust::pair page_bounds(page_state_s* const s, size_t min_row, size_t num_rows, bool is_bounds_pg, bool has_repetition, - rle_stream* decoders) + rle_stream* decoders) { using block_reduce = cub::BlockReduce; using block_scan = cub::BlockScan; @@ -164,13 +83,12 @@ __device__ thrust::pair page_bounds(page_state_s* const s, auto const col = &s->col; // initialize the stream decoders (requires values computed in setupLocalPageInfo) - int const max_batch_size = lvl_buf_size; - auto const def_decode = reinterpret_cast(pp->lvl_decode_buf[level_type::DEFINITION]); - auto const rep_decode = reinterpret_cast(pp->lvl_decode_buf[level_type::REPETITION]); + auto const def_decode = reinterpret_cast(pp->lvl_decode_buf[level_type::DEFINITION]); + auto const rep_decode = reinterpret_cast(pp->lvl_decode_buf[level_type::REPETITION]); decoders[level_type::DEFINITION].init(s->col.level_bits[level_type::DEFINITION], s->abs_lvl_start[level_type::DEFINITION], s->abs_lvl_end[level_type::DEFINITION], - max_batch_size, + preproc_buf_size, def_decode, s->page.num_input_values); // only need repetition if this is a bounds page. 
otherwise all we need is def level info @@ -179,7 +97,7 @@ __device__ thrust::pair page_bounds(page_state_s* const s, decoders[level_type::REPETITION].init(s->col.level_bits[level_type::REPETITION], s->abs_lvl_start[level_type::REPETITION], s->abs_lvl_end[level_type::REPETITION], - max_batch_size, + preproc_buf_size, rep_decode, s->page.num_input_values); } @@ -205,7 +123,13 @@ __device__ thrust::pair page_bounds(page_state_s* const s, int row_fudge = -1; // short circuit for no nulls - if (max_def == 0 && !has_repetition) { return {begin_row, end_row}; } + if (max_def == 0 && !has_repetition) { + if (t == 0) { + pp->num_nulls = 0; + pp->num_valids = end_row - begin_row; + } + return {begin_row, end_row}; + } int row_count = 0; int leaf_count = 0; @@ -238,7 +162,7 @@ __device__ thrust::pair page_bounds(page_state_s* const s, // do something with the level data while (start_val < processed) { auto const idx_t = start_val + t; - auto const idx = rolling_lvl_index(idx_t); + auto const idx = rolling_index(idx_t); // get absolute thread row index int is_new_row = idx_t < processed && (!has_repetition || rep_decode[idx] == 0); @@ -336,7 +260,7 @@ __device__ thrust::pair page_bounds(page_state_s* const s, while (start_val < processed) { auto const idx_t = start_val + t; if (idx_t < processed) { - auto const idx = rolling_lvl_index(idx_t); + auto const idx = rolling_index(idx_t); if (def_decode[idx] < max_def) { num_nulls++; } } start_val += preprocess_block_size; @@ -540,18 +464,14 @@ __device__ size_t totalPlainEntriesSize(uint8_t const* data, * @param chunks All chunks to be decoded * @param min_rows crop all rows below min_row * @param num_rows Maximum number of rows to read - * @tparam lvl_buf_size Size of the buffer used when decoding repetition and definition levels * @tparam level_t Type used to store decoded repetition and definition levels */ -template +template __global__ void __launch_bounds__(preprocess_block_size) gpuComputePageStringSizes( PageInfo* pages, device_span chunks, size_t min_row, size_t num_rows) { __shared__ __align__(16) page_state_s state_g; - // only count if it's a string column - if (not is_string_col(pages[blockIdx.x], chunks)) { return; } - page_state_s* const s = &state_g; int const page_idx = blockIdx.x; int const t = threadIdx.x; @@ -563,13 +483,19 @@ __global__ void __launch_bounds__(preprocess_block_size) gpuComputePageStringSiz // whether or not we have repetition levels (lists) bool const has_repetition = chunks[pp->chunk_idx].max_level[level_type::REPETITION] > 0; + // the required number of runs in shared memory we will need to provide the + // rle_stream object + constexpr int rle_run_buffer_size = rle_stream_required_run_buffer_size(); + // the level stream decoders - __shared__ rle_run def_runs[run_buffer_size]; - __shared__ rle_run rep_runs[run_buffer_size]; - rle_stream decoders[level_type::NUM_LEVEL_TYPES] = {{def_runs}, {rep_runs}}; + __shared__ rle_run def_runs[rle_run_buffer_size]; + __shared__ rle_run rep_runs[rle_run_buffer_size]; + rle_stream decoders[level_type::NUM_LEVEL_TYPES] = {{def_runs}, + {rep_runs}}; // setup page info - if (!setupLocalPageInfo(s, pp, chunks, min_row, num_rows, string_filter{chunks}, false)) { + if (!setupLocalPageInfo( + s, pp, chunks, min_row, num_rows, mask_filter{KERNEL_MASK_STRING}, false)) { return; } @@ -587,7 +513,7 @@ __global__ void __launch_bounds__(preprocess_block_size) gpuComputePageStringSiz // find start/end value indices auto const [start_value, end_value] = - page_bounds(s, min_row, num_rows, 
is_bounds_pg, has_repetition, decoders); + page_bounds(s, min_row, num_rows, is_bounds_pg, has_repetition, decoders); // need to save num_nulls and num_valids calculated in page_bounds in this page if (t == 0) { @@ -648,25 +574,26 @@ __global__ void __launch_bounds__(preprocess_block_size) gpuComputePageStringSiz * @param chunks List of column chunks * @param min_row Row index to start reading at * @param num_rows Maximum number of rows to read - * @tparam lvl_buf_size Size of the buffer used when decoding repetition and definition levels * @tparam level_t Type used to store decoded repetition and definition levels */ -template +template __global__ void __launch_bounds__(decode_block_size) gpuDecodeStringPageData( PageInfo* pages, device_span chunks, size_t min_row, size_t num_rows) { __shared__ __align__(16) page_state_s state_g; - __shared__ __align__(16) page_state_buffers_s state_buffers; __shared__ __align__(4) size_type last_offset; + __shared__ __align__(16) + page_state_buffers_s + state_buffers; - page_state_s* const s = &state_g; - page_state_buffers_s* const sb = &state_buffers; - int const page_idx = blockIdx.x; - int const t = threadIdx.x; + page_state_s* const s = &state_g; + auto* const sb = &state_buffers; + int const page_idx = blockIdx.x; + int const t = threadIdx.x; [[maybe_unused]] null_count_back_copier _{s, t}; if (!setupLocalPageInfo( - s, &pages[page_idx], chunks, min_row, num_rows, string_filter{chunks}, true)) { + s, &pages[page_idx], chunks, min_row, num_rows, mask_filter{KERNEL_MASK_STRING}, true)) { return; } @@ -680,8 +607,8 @@ __global__ void __launch_bounds__(decode_block_size) gpuDecodeStringPageData( int const leaf_level_index = s->col.max_nesting_depth - 1; PageNestingDecodeInfo* const nesting_info_base = s->nesting_info; - __shared__ level_t rep[lvl_buf_size]; // circular buffer of repetition level values - __shared__ level_t def[lvl_buf_size]; // circular buffer of definition level values + __shared__ level_t rep[rolling_buf_size]; // circular buffer of repetition level values + __shared__ level_t def[rolling_buf_size]; // circular buffer of definition level values // skipped_leaf_values will always be 0 for flat hierarchies. uint32_t skipped_leaf_values = s->page.skipped_leaf_values; @@ -702,7 +629,7 @@ __global__ void __launch_bounds__(decode_block_size) gpuDecodeStringPageData( // - update validity vectors // - updates offsets (for nested columns) // - produces non-NULL value indices in s->nz_idx for subsequent decoding - gpuDecodeLevels(s, sb, target_pos, rep, def, t); + gpuDecodeLevels(s, sb, target_pos, rep, def, t); } else if (t < out_thread0) { // skipped_leaf_values will always be 0 for flat hierarchies. uint32_t src_target_pos = target_pos + skipped_leaf_values; @@ -721,7 +648,7 @@ __global__ void __launch_bounds__(decode_block_size) gpuDecodeStringPageData( src_pos += t - out_thread0; // the position in the output column/buffer - int dst_pos = sb->nz_idx[rolling_index(src_pos)]; + int dst_pos = sb->nz_idx[rolling_index(src_pos)]; // for the flat hierarchy case we will be reading from the beginning of the value stream, // regardless of the value of first_row. so adjust our destination offset accordingly. 
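For illustration (not part of the patch): the rolling_index<rolling_buf_size>() calls above replace the old run-time lvl_buf_size parameter with compile-time circular-buffer indexing into the rep/def and nz_idx buffers. A minimal host-side sketch of that indexing, reusing the rolling_index definition this patch adds to parquet_gpu.hpp and taking rolling_buf_size = decode_block_size * 2 = 256 from the constants above; the main() harness and the printed values are illustrative only.

// Host-side sketch of the circular-buffer indexing used by the decode kernels.
// rolling_index<N>(i) == i % N, so a monotonically increasing position maps
// into a fixed buffer of N slots.
#include <array>
#include <cstdio>

template <int rolling_size>
constexpr int rolling_index(int index) { return index % rolling_size; }

int main() {
  constexpr int rolling_buf_size = 256;  // decode_block_size (128) * 2, as above
  std::array<int, rolling_buf_size> buf{};
  // positions 0..511 reuse the same 256 slots; a slot is only overwritten once
  // the producer is a full buffer length ahead of the consumer
  for (int pos = 0; pos < 2 * rolling_buf_size; ++pos) {
    buf[rolling_index<rolling_buf_size>(pos)] = pos;
  }
  std::printf("%d %d\n", buf[0], buf[rolling_buf_size - 1]);  // prints: 256 511
  return 0;
}

Because the buffer only needs to hold the window of values between the level decoder (producer) and the value decoder (consumer), a fixed-size buffer can stand in for the full page here.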
@@ -744,7 +671,7 @@ __global__ void __launch_bounds__(decode_block_size) gpuDecodeStringPageData( if (me < warp_size) { for (int i = 0; i < decode_block_size - out_thread0; i += warp_size) { - dst_pos = sb->nz_idx[rolling_index(src_pos + i)]; + dst_pos = sb->nz_idx[rolling_index(src_pos + i)]; if (!has_repetition) { dst_pos -= s->first_row; } auto [ptr, len] = src_pos + i < target_pos && dst_pos >= 0 @@ -827,10 +754,10 @@ void ComputePageStringSizes(cudf::detail::hostdevice_vector& pages, dim3 dim_block(preprocess_block_size, 1); dim3 dim_grid(pages.size(), 1); // 1 threadblock per page if (level_type_size == 1) { - gpuComputePageStringSizes + gpuComputePageStringSizes <<>>(pages.device_ptr(), chunks, min_row, num_rows); } else { - gpuComputePageStringSizes + gpuComputePageStringSizes <<>>(pages.device_ptr(), chunks, min_row, num_rows); } } @@ -851,10 +778,10 @@ void __host__ DecodeStringPageData(cudf::detail::hostdevice_vector& pa dim3 dim_grid(pages.size(), 1); // 1 threadblock per page if (level_type_size == 1) { - gpuDecodeStringPageData + gpuDecodeStringPageData <<>>(pages.device_ptr(), chunks, min_row, num_rows); } else { - gpuDecodeStringPageData + gpuDecodeStringPageData <<>>(pages.device_ptr(), chunks, min_row, num_rows); } } diff --git a/cpp/src/io/parquet/page_string_utils.cuh b/cpp/src/io/parquet/page_string_utils.cuh new file mode 100644 index 00000000000..9395599b3ff --- /dev/null +++ b/cpp/src/io/parquet/page_string_utils.cuh @@ -0,0 +1,110 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include + +namespace cudf::io::parquet::gpu { + +// stole this from cudf/strings/detail/gather.cuh. modified to run on a single string on one warp. +// copies from src to dst in 16B chunks per thread. +inline __device__ void wideStrcpy(uint8_t* dst, uint8_t const* src, size_t len, uint32_t lane_id) +{ + using cudf::detail::warp_size; + using cudf::strings::detail::load_uint4; + + constexpr size_t out_datatype_size = sizeof(uint4); + constexpr size_t in_datatype_size = sizeof(uint); + + auto const alignment_offset = reinterpret_cast(dst) % out_datatype_size; + uint4* out_chars_aligned = reinterpret_cast(dst - alignment_offset); + auto const in_start = src; + + // Both `out_start_aligned` and `out_end_aligned` are indices into `dst`. + // `out_start_aligned` is the first 16B aligned memory location after `dst + 4`. + // `out_end_aligned` is the last 16B aligned memory location before `len - 4`. Characters + // between `[out_start_aligned, out_end_aligned)` will be copied using uint4. + // `dst + 4` and `len - 4` are used instead of `dst` and `len` to avoid + // `load_uint4` reading beyond string boundaries. + // use signed int since out_end_aligned can be negative. 
+ int64_t const out_start_aligned = (in_datatype_size + alignment_offset + out_datatype_size - 1) / + out_datatype_size * out_datatype_size - + alignment_offset; + int64_t const out_end_aligned = + (len - in_datatype_size + alignment_offset) / out_datatype_size * out_datatype_size - + alignment_offset; + + for (int64_t ichar = out_start_aligned + lane_id * out_datatype_size; ichar < out_end_aligned; + ichar += warp_size * out_datatype_size) { + *(out_chars_aligned + (ichar + alignment_offset) / out_datatype_size) = + load_uint4((const char*)in_start + ichar); + } + + // Tail logic: copy characters of the current string outside + // `[out_start_aligned, out_end_aligned)`. + if (out_end_aligned <= out_start_aligned) { + // In this case, `[out_start_aligned, out_end_aligned)` is an empty set, and we copy the + // entire string. + for (int64_t ichar = lane_id; ichar < len; ichar += warp_size) { + dst[ichar] = in_start[ichar]; + } + } else { + // Copy characters in range `[0, out_start_aligned)`. + if (lane_id < out_start_aligned) { dst[lane_id] = in_start[lane_id]; } + // Copy characters in range `[out_end_aligned, len)`. + int64_t ichar = out_end_aligned + lane_id; + if (ichar < len) { dst[ichar] = in_start[ichar]; } + } +} + +/** + * @brief char-parallel string copy. + */ +inline __device__ void ll_strcpy(uint8_t* dst, uint8_t const* src, size_t len, uint32_t lane_id) +{ + using cudf::detail::warp_size; + if (len > 64) { + wideStrcpy(dst, src, len, lane_id); + } else { + for (int i = lane_id; i < len; i += warp_size) { + dst[i] = src[i]; + } + } +} + +/** + * @brief Perform exclusive scan on an array of any length using a single block of threads. + */ +template +__device__ void block_excl_sum(size_type* arr, size_type length, size_type initial_value) +{ + using block_scan = cub::BlockScan; + __shared__ typename block_scan::TempStorage scan_storage; + int const t = threadIdx.x; + + // do a series of block sums, storing results in arr as we go + for (int pos = 0; pos < length; pos += block_size) { + int const tidx = pos + t; + size_type tval = tidx < length ? arr[tidx] : 0; + size_type block_sum; + block_scan(scan_storage).ExclusiveScan(tval, tval, initial_value, cub::Sum(), block_sum); + if (tidx < length) { arr[tidx] = tval; } + initial_value += block_sum; + } +} + +} // namespace cudf::io::parquet::gpu diff --git a/cpp/src/io/parquet/parquet_gpu.hpp b/cpp/src/io/parquet/parquet_gpu.hpp index b7a8f4e2157..fd971e342c0 100644 --- a/cpp/src/io/parquet/parquet_gpu.hpp +++ b/cpp/src/io/parquet/parquet_gpu.hpp @@ -48,6 +48,12 @@ constexpr size_type MAX_DICT_SIZE = (1 << MAX_DICT_BITS) - 1; // level decode buffer size. constexpr int LEVEL_DECODE_BUF_SIZE = 2048; +template +constexpr int rolling_index(int index) +{ + return index % rolling_size; +} + /** * @brief Struct representing an input column in the file. */ @@ -87,6 +93,17 @@ enum level_type { NUM_LEVEL_TYPES }; +/** + * @brief Enum of mask bits for the PageInfo kernel_mask + * + * Used to control which decode kernels to run. + */ +enum kernel_mask_bits { + KERNEL_MASK_GENERAL = (1 << 0), // Run catch-all decode kernel + KERNEL_MASK_STRING = (1 << 1), // Run decode kernel for string data + KERNEL_MASK_DELTA_BINARY = (1 << 2) // Run decode kernel for DELTA_BINARY_PACKED data +}; + /** * @brief Nesting information specifically needed by the decode and preprocessing * kernels. 
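As a reading aid (not part of the patch): the block_excl_sum() helper added above in page_string_utils.cuh performs a block-wide exclusive scan per block_size-sized chunk and carries the running total forward in initial_value, so over the whole array it is equivalent to a single exclusive prefix sum seeded with initial_value. A host-side reference sketch using std::exclusive_scan (C++17) purely for comparison; the function name excl_sum_reference and the sample values are made up.

// CPU reference for the result block_excl_sum produces on the device.
#include <cstdio>
#include <numeric>
#include <vector>

std::vector<int> excl_sum_reference(std::vector<int> arr, int initial_value) {
  // exclusive prefix sum seeded with initial_value, scanning in place
  std::exclusive_scan(arr.begin(), arr.end(), arr.begin(), initial_value);
  return arr;
}

int main() {
  // e.g. per-row string lengths turned into output offsets
  auto out = excl_sum_reference({3, 1, 4, 1, 5}, 0);
  for (int v : out) std::printf("%d ", v);  // prints: 0 3 4 8 9
  std::printf("\n");
  return 0;
}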
@@ -203,6 +220,8 @@ struct PageInfo { // level decode buffers uint8_t* lvl_decode_buf[level_type::NUM_LEVEL_TYPES]; + + uint32_t kernel_mask; }; /** @@ -458,6 +477,19 @@ void BuildStringDictionaryIndex(ColumnChunkDesc* chunks, int32_t num_chunks, rmm::cuda_stream_view stream); +/** + * @brief Get the set of kernels that need to be invoked on these pages as a bitmask. + * + * This function performs a bitwise OR on all of the individual `kernel_mask` fields on the pages + * passed in. + * + * @param[in] pages List of pages to aggregate + * @param[in] stream CUDA stream to use + * @return Bitwise OR of all page `kernel_mask` values + */ +uint32_t GetAggregatedDecodeKernelMask(cudf::detail::hostdevice_vector& pages, + rmm::cuda_stream_view stream); + /** * @brief Compute page output size information. * @@ -553,6 +585,26 @@ void DecodeStringPageData(cudf::detail::hostdevice_vector& pages, int level_type_size, rmm::cuda_stream_view stream); +/** + * @brief Launches kernel for reading the DELTA_BINARY_PACKED column data stored in the pages + * + * The page data will be written to the output pointed to in the page's + * associated column chunk. + * + * @param[in,out] pages All pages to be decoded + * @param[in] chunks All chunks to be decoded + * @param[in] num_rows Total number of rows to read + * @param[in] min_row Minimum number of rows to read + * @param[in] level_type_size Size in bytes of the type for level decoding + * @param[in] stream CUDA stream to use, default 0 + */ +void DecodeDeltaBinary(cudf::detail::hostdevice_vector& pages, + cudf::detail::hostdevice_vector const& chunks, + size_t num_rows, + size_t min_row, + int level_type_size, + rmm::cuda_stream_view stream); + /** * @brief Launches kernel for initializing encoder row group fragments * diff --git a/cpp/src/io/parquet/reader_impl.cpp b/cpp/src/io/parquet/reader_impl.cpp index b9f3639da79..3f58fc8d42d 100644 --- a/cpp/src/io/parquet/reader_impl.cpp +++ b/cpp/src/io/parquet/reader_impl.cpp @@ -27,7 +27,7 @@ namespace cudf::io::detail::parquet { namespace { -int constexpr NUM_DECODERS = 2; // how many decode kernels are there to run +int constexpr NUM_DECODERS = 3; // how many decode kernels are there to run int constexpr APPROX_NUM_THREADS = 4; // guestimate from DaveB int constexpr STREAM_POOL_SIZE = NUM_DECODERS * APPROX_NUM_THREADS; @@ -58,14 +58,16 @@ void reader::impl::decode_page_data(size_t skip_rows, size_t num_rows) return cursum + _metadata->get_output_nesting_depth(chunk.src_col_schema); }); + // figure out which kernels to run + auto const kernel_mask = GetAggregatedDecodeKernelMask(pages, _stream); + // Check to see if there are any string columns present. If so, then we need to get size info // for each string page. This size info will be used to pre-allocate memory for the column, // allowing the page decoder to write string data directly to the column buffer, rather than // doing a gather operation later on. // TODO: This step is somewhat redundant if size info has already been calculated (nested schema, // chunked reader). 
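For illustration (not part of the patch): the per-page kernel_mask bits declared above, the bitwise-OR aggregation documented for GetAggregatedDecodeKernelMask(), and the dispatch in reader_impl.cpp combine into the following host-side sketch. The enum values mirror kernel_mask_bits; the std::accumulate reduction and the sample page list are stand-ins for the device-side aggregation and real PageInfo data.

// Sketch of mask aggregation and decoder selection.
#include <cstdint>
#include <cstdio>
#include <functional>
#include <numeric>
#include <vector>

enum kernel_mask_bits : uint32_t {
  KERNEL_MASK_GENERAL      = 1u << 0,
  KERNEL_MASK_STRING       = 1u << 1,
  KERNEL_MASK_DELTA_BINARY = 1u << 2,
};

int main() {
  // per-page masks: two plain pages, one string page, one DELTA_BINARY_PACKED page
  std::vector<uint32_t> page_masks{KERNEL_MASK_GENERAL, KERNEL_MASK_GENERAL,
                                   KERNEL_MASK_STRING, KERNEL_MASK_DELTA_BINARY};
  uint32_t const aggregate = std::accumulate(
      page_masks.begin(), page_masks.end(), 0u, std::bit_or<uint32_t>{});

  // launch only the decoders whose bit is present in the aggregate
  if (aggregate & KERNEL_MASK_STRING)       std::printf("launch string decoder\n");
  if (aggregate & KERNEL_MASK_DELTA_BINARY) std::printf("launch delta-binary decoder\n");
  if (aggregate & KERNEL_MASK_GENERAL)      std::printf("launch general decoder\n");
  return 0;
}

In the actual reader each selected decoder is additionally launched on its own stream from the pool and the streams are synchronized afterwards, as the reader_impl.cpp hunk that follows shows.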
- auto const has_strings = std::any_of(chunks.begin(), chunks.end(), gpu::is_string_col); - + auto const has_strings = (kernel_mask & gpu::KERNEL_MASK_STRING) != 0; std::vector col_sizes(_input_columns.size(), 0L); if (has_strings) { gpu::ComputePageStringSizes( @@ -178,16 +180,32 @@ void reader::impl::decode_page_data(size_t skip_rows, size_t num_rows) chunk_nested_data.host_to_device_async(_stream); _stream.synchronize(); - auto stream1 = get_stream_pool().get_stream(); - gpu::DecodePageData(pages, chunks, num_rows, skip_rows, _file_itm_data.level_type_size, stream1); + auto const level_type_size = _file_itm_data.level_type_size; + + // vector of launched streams + std::vector streams; + + // launch string decoder if (has_strings) { - auto stream2 = get_stream_pool().get_stream(); - chunk_nested_str_data.host_to_device_async(stream2); - gpu::DecodeStringPageData( - pages, chunks, num_rows, skip_rows, _file_itm_data.level_type_size, stream2); - stream2.synchronize(); + streams.push_back(get_stream_pool().get_stream()); + chunk_nested_str_data.host_to_device_async(streams.back()); + gpu::DecodeStringPageData(pages, chunks, num_rows, skip_rows, level_type_size, streams.back()); } - stream1.synchronize(); + + // launch delta binary decoder + if ((kernel_mask & gpu::KERNEL_MASK_DELTA_BINARY) != 0) { + streams.push_back(get_stream_pool().get_stream()); + gpu::DecodeDeltaBinary(pages, chunks, num_rows, skip_rows, level_type_size, streams.back()); + } + + // launch the catch-all page decoder + if ((kernel_mask & gpu::KERNEL_MASK_GENERAL) != 0) { + streams.push_back(get_stream_pool().get_stream()); + gpu::DecodePageData(pages, chunks, num_rows, skip_rows, level_type_size, streams.back()); + } + + // synchronize the streams + std::for_each(streams.begin(), streams.end(), [](auto& stream) { stream.synchronize(); }); pages.device_to_host_async(_stream); page_nesting.device_to_host_async(_stream); diff --git a/cpp/src/io/parquet/reader_impl_preprocess.cu b/cpp/src/io/parquet/reader_impl_preprocess.cu index 7cdccf0b273..bde73c3dd96 100644 --- a/cpp/src/io/parquet/reader_impl_preprocess.cu +++ b/cpp/src/io/parquet/reader_impl_preprocess.cu @@ -323,7 +323,8 @@ constexpr bool is_supported_encoding(Encoding enc) case Encoding::PLAIN: case Encoding::PLAIN_DICTIONARY: case Encoding::RLE: - case Encoding::RLE_DICTIONARY: return true; + case Encoding::RLE_DICTIONARY: + case Encoding::DELTA_BINARY_PACKED: return true; default: return false; } } @@ -730,8 +731,8 @@ std::pair>> reader::impl::create_and_read_co auto& chunks = _file_itm_data.chunks; // Descriptors for all the chunks that make up the selected columns - const auto num_input_columns = _input_columns.size(); - const auto num_chunks = row_groups_info.size() * num_input_columns; + auto const num_input_columns = _input_columns.size(); + auto const num_chunks = row_groups_info.size() * num_input_columns; chunks = cudf::detail::hostdevice_vector(0, num_chunks, _stream); // Association between each column chunk and its source diff --git a/cpp/src/io/parquet/rle_stream.cuh b/cpp/src/io/parquet/rle_stream.cuh index 473db660238..2545a074a38 100644 --- a/cpp/src/io/parquet/rle_stream.cuh +++ b/cpp/src/io/parquet/rle_stream.cuh @@ -22,16 +22,12 @@ namespace cudf::io::parquet::gpu { -// TODO: consider if these should be template parameters to rle_stream -constexpr int num_rle_stream_decode_threads = 512; -// the -1 here is for the look-ahead warp that fills in the list of runs to be decoded -// in an overlapped manner. 
so if we had 16 total warps: -// - warp 0 would be filling in batches of runs to be processed -// - warps 1-15 would be decoding the previous batch of runs generated -constexpr int num_rle_stream_decode_warps = - (num_rle_stream_decode_threads / cudf::detail::warp_size) - 1; -constexpr int run_buffer_size = (num_rle_stream_decode_warps * 2); -constexpr int rolling_run_index(int index) { return index % run_buffer_size; } +template +constexpr int rle_stream_required_run_buffer_size() +{ + constexpr int num_rle_stream_decode_warps = (num_threads / cudf::detail::warp_size) - 1; + return (num_rle_stream_decode_warps * 2); +} /** * @brief Read a 32-bit varint integer @@ -144,8 +140,18 @@ struct rle_run { }; // a stream of rle_runs -template +template struct rle_stream { + static constexpr int num_rle_stream_decode_threads = decode_threads; + // the -1 here is for the look-ahead warp that fills in the list of runs to be decoded + // in an overlapped manner. so if we had 16 total warps: + // - warp 0 would be filling in batches of runs to be processed + // - warps 1-15 would be decoding the previous batch of runs generated + static constexpr int num_rle_stream_decode_warps = + (num_rle_stream_decode_threads / cudf::detail::warp_size) - 1; + + static constexpr int run_buffer_size = rle_stream_required_run_buffer_size(); + int level_bits; uint8_t const* start; uint8_t const* cur; @@ -210,7 +216,7 @@ struct rle_stream { // generate runs until we either run out of warps to decode them with, or // we cross the output limit. while (run_count < num_rle_stream_decode_warps && output_pos < max_count && cur < end) { - auto& run = runs[rolling_run_index(run_index)]; + auto& run = runs[rolling_index(run_index)]; // Encoding::RLE @@ -256,13 +262,13 @@ struct rle_stream { // if we've reached the value output limit on the last run if (output_pos >= max_count) { // first, see if we've spilled over - auto const& src = runs[rolling_run_index(run_index - 1)]; + auto const& src = runs[rolling_index(run_index - 1)]; int const spill_count = output_pos - max_count; // a spill has occurred in the current run. spill the extra values over into the beginning of // the next run. if (spill_count > 0) { - auto& spill_run = runs[rolling_run_index(run_index)]; + auto& spill_run = runs[rolling_index(run_index)]; spill_run = src; spill_run.output_pos = 0; spill_run.remaining = spill_count; @@ -330,7 +336,7 @@ struct rle_stream { // repetition levels for one of the list benchmarks decodes in ~3ms total, while the // definition levels take ~11ms - the difference is entirely due to long runs in the // definition levels. 
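A quick worked example (not part of the patch) of the sizing helper above: with one warp reserved for filling the run list, rle_stream_required_run_buffer_size<num_threads>() evaluates to ((num_threads / warp_size) - 1) * 2. The sketch below re-derives the two sizes relevant here; warp_size is hard-coded to 32 for the illustration.

// Worked arithmetic for the double-buffered run storage.
#include <cstdio>

constexpr int warp_size = 32;  // cudf::detail::warp_size

template <int num_threads>
constexpr int rle_stream_required_run_buffer_size() {
  // one warp produces runs, the rest decode them; keep two batches of runs
  constexpr int num_decode_warps = (num_threads / warp_size) - 1;
  return num_decode_warps * 2;
}

int main() {
  // preprocess_block_size = 512 -> 15 decode warps -> 30 run slots
  // a 128-thread block, for comparison -> 3 decode warps -> 6 run slots
  std::printf("%d %d\n",
              rle_stream_required_run_buffer_size<512>(),   // 30
              rle_stream_required_run_buffer_size<128>());  // 6
  return 0;
}

This is why the string preprocessing kernel above declares its def_runs and rep_runs shared-memory arrays with 30 entries each for its 512-thread block.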
- auto& run = runs[rolling_run_index(run_start + warp_decode_id)]; + auto& run = runs[rolling_index(run_start + warp_decode_id)]; auto batch = run.next_batch(output + run.output_pos, min(run.remaining, (output_count - run.output_pos))); batch.decode(end, level_bits, warp_lane, warp_decode_id); diff --git a/python/cudf/cudf/tests/data/parquet/delta_encoding.parquet b/python/cudf/cudf/tests/data/parquet/delta_encoding.parquet index e129ced34f3b570ba0ae966277f2111f8f539465..29565bef4d2e79033e2631a46eebedd3292db6b7 100644 GIT binary patch delta 28 icmX@ea*#zhz%j^BlucAlR4E2XF^DpW@@y2=V*&tHoCNOx delta 28 icmX@ea*#zhz%j^BlucAlR4E2XF^DpWa%~jWV*&tHk_7Dl diff --git a/python/cudf/cudf/tests/test_parquet.py b/python/cudf/cudf/tests/test_parquet.py index a08ab211b8e..2cc4b32443d 100644 --- a/python/cudf/cudf/tests/test_parquet.py +++ b/python/cudf/cudf/tests/test_parquet.py @@ -1286,6 +1286,56 @@ def test_parquet_reader_v2(tmpdir, simple_pdf): assert_eq(cudf.read_parquet(pdf_fname), simple_pdf) +@pytest.mark.parametrize("nrows", [1, 100000]) +@pytest.mark.parametrize("add_nulls", [True, False]) +def test_delta_binary(nrows, add_nulls, tmpdir): + null_frequency = 0.25 if add_nulls else 0 + + # Create a pandas dataframe with random data of mixed types + arrow_table = dg.rand_dataframe( + dtypes_meta=[ + { + "dtype": "int8", + "null_frequency": null_frequency, + "cardinality": nrows, + }, + { + "dtype": "int16", + "null_frequency": null_frequency, + "cardinality": nrows, + }, + { + "dtype": "int32", + "null_frequency": null_frequency, + "cardinality": nrows, + }, + { + "dtype": "int64", + "null_frequency": null_frequency, + "cardinality": nrows, + }, + ], + rows=nrows, + seed=0, + use_threads=False, + ) + # Roundabout conversion to pandas to preserve nulls/data types + cudf_table = cudf.DataFrame.from_arrow(arrow_table) + test_pdf = cudf_table.to_pandas(nullable=True) + pdf_fname = tmpdir.join("pdfv2.parquet") + test_pdf.to_parquet( + pdf_fname, + version="2.6", + column_encoding="DELTA_BINARY_PACKED", + data_page_version="2.0", + engine="pyarrow", + use_dictionary=False, + ) + cdf = cudf.read_parquet(pdf_fname) + pcdf = cudf.from_pandas(test_pdf) + assert_eq(cdf, pcdf) + + @pytest.mark.parametrize( "data", [ From 4014ea32596ee55b9da9c67af56e1840fb54adc9 Mon Sep 17 00:00:00 2001 From: Gera Shegalov Date: Wed, 23 Aug 2023 15:06:30 -0700 Subject: [PATCH 090/230] Add HostMemoryAllocator interface (#13924) Creates an interface to intercept calls to HostMemoryBuffer.allocate Fixes NVIDIA/spark-rapids#8884 Signed-off-by: Gera Shegalov Authors: - Gera Shegalov (https://github.com/gerashegalov) Approvers: - Robert (Bobby) Evans (https://github.com/revans2) URL: https://github.com/rapidsai/cudf/pull/13924 --- .../main/java/ai/rapids/cudf/ColumnView.java | 30 ++++---- .../cudf/DefaultHostMemoryAllocator.java | 36 +++++++++ .../ai/rapids/cudf/HostMemoryAllocator.java | 39 ++++++++++ .../ai/rapids/cudf/JCudfSerialization.java | 23 +++++- .../java/ai/rapids/cudf/PinnedMemoryPool.java | 8 +- java/src/main/java/ai/rapids/cudf/Table.java | 74 +++++++++++++++---- .../cudf/nvcomp/BatchedLZ4Compressor.java | 10 ++- .../java/ai/rapids/cudf/ColumnVectorTest.java | 3 +- .../cudf/ColumnViewNonEmptyNullsTest.java | 4 +- .../test/java/ai/rapids/cudf/CuFileTest.java | 15 ++-- .../java/ai/rapids/cudf/GatherMapTest.java | 8 +- .../java/ai/rapids/cudf/MemoryBufferTest.java | 42 ++++++----- .../test/java/ai/rapids/cudf/TableTest.java | 16 ++-- .../ai/rapids/cudf/nvcomp/NvcompTest.java | 8 +- 14 files changed, 237 insertions(+), 79 
deletions(-) create mode 100644 java/src/main/java/ai/rapids/cudf/DefaultHostMemoryAllocator.java create mode 100644 java/src/main/java/ai/rapids/cudf/HostMemoryAllocator.java diff --git a/java/src/main/java/ai/rapids/cudf/ColumnView.java b/java/src/main/java/ai/rapids/cudf/ColumnView.java index 7db40278d4e..3f3a55f0970 100644 --- a/java/src/main/java/ai/rapids/cudf/ColumnView.java +++ b/java/src/main/java/ai/rapids/cudf/ColumnView.java @@ -5003,7 +5003,7 @@ private static NestedColumnVector createNestedColumnVector(DType type, long rows ///////////////////////////////////////////////////////////////////////////// private static HostColumnVectorCore copyToHostNestedHelper( - ColumnView deviceCvPointer) { + ColumnView deviceCvPointer, HostMemoryAllocator hostMemoryAllocator) { if (deviceCvPointer == null) { return null; } @@ -5023,21 +5023,21 @@ private static HostColumnVectorCore copyToHostNestedHelper( currOffsets = deviceCvPointer.getOffsets(); currValidity = deviceCvPointer.getValid(); if (currData != null) { - hostData = HostMemoryBuffer.allocate(currData.length); + hostData = hostMemoryAllocator.allocate(currData.length); hostData.copyFromDeviceBuffer(currData); } if (currValidity != null) { - hostValid = HostMemoryBuffer.allocate(currValidity.length); + hostValid = hostMemoryAllocator.allocate(currValidity.length); hostValid.copyFromDeviceBuffer(currValidity); } if (currOffsets != null) { - hostOffsets = HostMemoryBuffer.allocate(currOffsets.length); + hostOffsets = hostMemoryAllocator.allocate(currOffsets.length); hostOffsets.copyFromDeviceBuffer(currOffsets); } int numChildren = deviceCvPointer.getNumChildren(); for (int i = 0; i < numChildren; i++) { try(ColumnView childDevPtr = deviceCvPointer.getChildColumnView(i)) { - children.add(copyToHostNestedHelper(childDevPtr)); + children.add(copyToHostNestedHelper(childDevPtr, hostMemoryAllocator)); } } currNullCount = deviceCvPointer.getNullCount(); @@ -5074,7 +5074,7 @@ private static HostColumnVectorCore copyToHostNestedHelper( /** * Copy the data to the host. 
*/ - public HostColumnVector copyToHost() { + public HostColumnVector copyToHost(HostMemoryAllocator hostMemoryAllocator) { try (NvtxRange toHost = new NvtxRange("ensureOnHost", NvtxColor.BLUE)) { HostMemoryBuffer hostDataBuffer = null; HostMemoryBuffer hostValidityBuffer = null; @@ -5094,16 +5094,16 @@ public HostColumnVector copyToHost() { getNullCount(); if (!type.isNestedType()) { if (valid != null) { - hostValidityBuffer = HostMemoryBuffer.allocate(valid.getLength()); + hostValidityBuffer = hostMemoryAllocator.allocate(valid.getLength()); hostValidityBuffer.copyFromDeviceBuffer(valid); } if (offsets != null) { - hostOffsetsBuffer = HostMemoryBuffer.allocate(offsets.length); + hostOffsetsBuffer = hostMemoryAllocator.allocate(offsets.length); hostOffsetsBuffer.copyFromDeviceBuffer(offsets); } // If a strings column is all null values there is no data buffer allocated if (data != null) { - hostDataBuffer = HostMemoryBuffer.allocate(data.length); + hostDataBuffer = hostMemoryAllocator.allocate(data.length); hostDataBuffer.copyFromDeviceBuffer(data); } HostColumnVector ret = new HostColumnVector(type, rows, Optional.of(nullCount), @@ -5112,22 +5112,22 @@ public HostColumnVector copyToHost() { return ret; } else { if (data != null) { - hostDataBuffer = HostMemoryBuffer.allocate(data.length); + hostDataBuffer = hostMemoryAllocator.allocate(data.length); hostDataBuffer.copyFromDeviceBuffer(data); } if (valid != null) { - hostValidityBuffer = HostMemoryBuffer.allocate(valid.getLength()); + hostValidityBuffer = hostMemoryAllocator.allocate(valid.getLength()); hostValidityBuffer.copyFromDeviceBuffer(valid); } if (offsets != null) { - hostOffsetsBuffer = HostMemoryBuffer.allocate(offsets.getLength()); + hostOffsetsBuffer = hostMemoryAllocator.allocate(offsets.getLength()); hostOffsetsBuffer.copyFromDeviceBuffer(offsets); } List children = new ArrayList<>(); for (int i = 0; i < getNumChildren(); i++) { try (ColumnView childDevPtr = getChildColumnView(i)) { - children.add(copyToHostNestedHelper(childDevPtr)); + children.add(copyToHostNestedHelper(childDevPtr, hostMemoryAllocator)); } } HostColumnVector ret = new HostColumnVector(type, rows, Optional.of(nullCount), @@ -5160,6 +5160,10 @@ public HostColumnVector copyToHost() { } } + public HostColumnVector copyToHost() { + return copyToHost(DefaultHostMemoryAllocator.get()); + } + /** * Calculate the total space required to copy the data to the host. This should be padded to * the alignment that the CPU requires. diff --git a/java/src/main/java/ai/rapids/cudf/DefaultHostMemoryAllocator.java b/java/src/main/java/ai/rapids/cudf/DefaultHostMemoryAllocator.java new file mode 100644 index 00000000000..98a5b00cf85 --- /dev/null +++ b/java/src/main/java/ai/rapids/cudf/DefaultHostMemoryAllocator.java @@ -0,0 +1,36 @@ +/* + * + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ * + */ + +package ai.rapids.cudf; + +public class DefaultHostMemoryAllocator implements HostMemoryAllocator { + private static final HostMemoryAllocator INSTANCE = new DefaultHostMemoryAllocator(); + public static HostMemoryAllocator get() { + return INSTANCE; + } + + @Override + public HostMemoryBuffer allocate(long bytes, boolean preferPinned) { + return HostMemoryBuffer.allocate(bytes, preferPinned); + } + + @Override + public HostMemoryBuffer allocate(long bytes) { + return HostMemoryBuffer.allocate(bytes); + } +} diff --git a/java/src/main/java/ai/rapids/cudf/HostMemoryAllocator.java b/java/src/main/java/ai/rapids/cudf/HostMemoryAllocator.java new file mode 100644 index 00000000000..9834eb85e18 --- /dev/null +++ b/java/src/main/java/ai/rapids/cudf/HostMemoryAllocator.java @@ -0,0 +1,39 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package ai.rapids.cudf; + +public interface HostMemoryAllocator { + + /** + * Allocate memory, but be sure to close the returned buffer to avoid memory leaks. + * @param bytes size in bytes to allocate + * @param preferPinned If set to true, the pinned memory pool will be used if possible with a + * fallback to off-heap memory. If set to false, the allocation will always + * be from off-heap memory. + * @return the newly created buffer + */ + HostMemoryBuffer allocate(long bytes, boolean preferPinned); + + /** + * Allocate memory, but be sure to close the returned buffer to avoid memory leaks. Pinned memory + * for allocations preference is up to the implementor + * + * @param bytes size in bytes to allocate + * @return the newly created buffer + */ + HostMemoryBuffer allocate(long bytes); +} diff --git a/java/src/main/java/ai/rapids/cudf/JCudfSerialization.java b/java/src/main/java/ai/rapids/cudf/JCudfSerialization.java index 40a22604f49..7deb5bae541 100644 --- a/java/src/main/java/ai/rapids/cudf/JCudfSerialization.java +++ b/java/src/main/java/ai/rapids/cudf/JCudfSerialization.java @@ -1810,14 +1810,17 @@ public static ContiguousTable concatToContiguousTable(SerializedTableHeader[] he * Concatenate multiple tables in host memory into a single host table buffer. 
* @param headers table headers corresponding to the host table buffers * @param dataBuffers host table buffer for each input table to be concatenated + * @param hostMemoryAllocator allocator for host memory buffers * @return host table header and buffer */ public static HostConcatResult concatToHostBuffer(SerializedTableHeader[] headers, - HostMemoryBuffer[] dataBuffers) throws IOException { + HostMemoryBuffer[] dataBuffers, + HostMemoryAllocator hostMemoryAllocator + ) throws IOException { ColumnBufferProvider[][] providersPerColumn = providersFrom(headers, dataBuffers); try { SerializedTableHeader combined = calcConcatHeader(providersPerColumn); - HostMemoryBuffer hostBuffer = HostMemoryBuffer.allocate(combined.dataLen); + HostMemoryBuffer hostBuffer = hostMemoryAllocator.allocate(combined.dataLen); try { try (NvtxRange range = new NvtxRange("Concat Host Side", NvtxColor.GREEN)) { DataWriter writer = writerFrom(hostBuffer); @@ -1837,6 +1840,12 @@ public static HostConcatResult concatToHostBuffer(SerializedTableHeader[] header } } + public static HostConcatResult concatToHostBuffer(SerializedTableHeader[] headers, + HostMemoryBuffer[] dataBuffers + ) throws IOException { + return concatToHostBuffer(headers, dataBuffers, DefaultHostMemoryAllocator.get()); + } + /** * Deserialize a serialized contiguous table into an array of host columns. * @@ -1916,12 +1925,14 @@ public static TableAndRowCountPair readTableFrom(SerializedTableHeader header, /** * Read a serialize table from the given InputStream. * @param in the stream to read the table data from. + * @param hostMemoryAllocator a host memory allocator for an intermediate host memory buffer * @return the deserialized table in device memory, or null if the stream has no table to read * from, an end of the stream at the very beginning. * @throws IOException on any error. * @throws EOFException if the data stream ended unexpectedly in the middle of processing. */ - public static TableAndRowCountPair readTableFrom(InputStream in) throws IOException { + public static TableAndRowCountPair readTableFrom(InputStream in, + HostMemoryAllocator hostMemoryAllocator) throws IOException { DataInputStream din; if (in instanceof DataInputStream) { din = (DataInputStream) in; @@ -1934,7 +1945,7 @@ public static TableAndRowCountPair readTableFrom(InputStream in) throws IOExcept return new TableAndRowCountPair(0, null); } - try (HostMemoryBuffer hostBuffer = HostMemoryBuffer.allocate(header.dataLen)) { + try (HostMemoryBuffer hostBuffer = hostMemoryAllocator.allocate(header.dataLen)) { if (header.dataLen > 0) { readTableIntoBuffer(din, header, hostBuffer); } @@ -1942,6 +1953,10 @@ public static TableAndRowCountPair readTableFrom(InputStream in) throws IOExcept } } + public static TableAndRowCountPair readTableFrom(InputStream in) throws IOException { + return readTableFrom(in, DefaultHostMemoryAllocator.get()); + } + /** Holds the result of deserializing a table. 
*/ public static final class TableAndRowCountPair implements Closeable { private final int numRows; diff --git a/java/src/main/java/ai/rapids/cudf/PinnedMemoryPool.java b/java/src/main/java/ai/rapids/cudf/PinnedMemoryPool.java index 6eee935748e..969946a9533 100644 --- a/java/src/main/java/ai/rapids/cudf/PinnedMemoryPool.java +++ b/java/src/main/java/ai/rapids/cudf/PinnedMemoryPool.java @@ -233,14 +233,18 @@ public static HostMemoryBuffer tryAllocate(long bytes) { * @param bytes size in bytes to allocate * @return newly created buffer */ - public static HostMemoryBuffer allocate(long bytes) { + public static HostMemoryBuffer allocate(long bytes, HostMemoryAllocator hostMemoryAllocator) { HostMemoryBuffer result = tryAllocate(bytes); if (result == null) { - result = HostMemoryBuffer.allocate(bytes, false); + result = hostMemoryAllocator.allocate(bytes, false); } return result; } + public static HostMemoryBuffer allocate(long bytes) { + return allocate(bytes, DefaultHostMemoryAllocator.get()); + } + /** * Get the number of bytes free in the pinned memory pool. * diff --git a/java/src/main/java/ai/rapids/cudf/Table.java b/java/src/main/java/ai/rapids/cudf/Table.java index 4eb28f48337..57189b052b6 100644 --- a/java/src/main/java/ai/rapids/cudf/Table.java +++ b/java/src/main/java/ai/rapids/cudf/Table.java @@ -821,22 +821,29 @@ public static Table readCSV(Schema schema, CSVOptions opts, byte[] buffer) { * @param buffer raw UTF8 formatted bytes. * @param offset the starting offset into buffer. * @param len the number of bytes to parse. + * @param hostMemoryAllocator allocator for host memory buffers * @return the data parsed as a table on the GPU. */ public static Table readCSV(Schema schema, CSVOptions opts, byte[] buffer, long offset, - long len) { + long len, HostMemoryAllocator hostMemoryAllocator) { if (len <= 0) { len = buffer.length - offset; } assert len > 0; assert len <= buffer.length - offset; assert offset >= 0 && offset < buffer.length; - try (HostMemoryBuffer newBuf = HostMemoryBuffer.allocate(len)) { + try (HostMemoryBuffer newBuf = hostMemoryAllocator.allocate(len)) { newBuf.setBytes(0, buffer, offset, len); return readCSV(schema, opts, newBuf, 0, len); } } + + public static Table readCSV(Schema schema, CSVOptions opts, byte[] buffer, long offset, + long len) { + return readCSV(schema, opts, buffer, offset, len, DefaultHostMemoryAllocator.get()); + } + /** * Read CSV formatted data. * @param schema the schema of the data. You may use Schema.INFERRED to infer the schema. @@ -1038,22 +1045,28 @@ public static Table readJSON(Schema schema, JSONOptions opts, File path) { * @param buffer raw UTF8 formatted bytes. * @param offset the starting offset into buffer. * @param len the number of bytes to parse. + * @param hostMemoryAllocator allocator for host memory buffers * @return the data parsed as a table on the GPU. 
*/ public static Table readJSON(Schema schema, JSONOptions opts, byte[] buffer, long offset, - long len) { + long len, HostMemoryAllocator hostMemoryAllocator) { if (len <= 0) { len = buffer.length - offset; } assert len > 0; assert len <= buffer.length - offset; assert offset >= 0 && offset < buffer.length; - try (HostMemoryBuffer newBuf = HostMemoryBuffer.allocate(len)) { + try (HostMemoryBuffer newBuf = hostMemoryAllocator.allocate(len)) { newBuf.setBytes(0, buffer, offset, len); return readJSON(schema, opts, newBuf, 0, len); } } + public static Table readJSON(Schema schema, JSONOptions opts, byte[] buffer, long offset, + long len) { + return readJSON(schema, opts, buffer, offset, len, DefaultHostMemoryAllocator.get()); + } + /** * Read JSON formatted data and infer the column names and schema. * @param opts various JSON parsing options. @@ -1143,21 +1156,27 @@ public static Table readParquet(ParquetOptions opts, byte[] buffer) { * @param buffer raw parquet formatted bytes. * @param offset the starting offset into buffer. * @param len the number of bytes to parse. + * @param hostMemoryAllocator allocator for host memory buffers * @return the data parsed as a table on the GPU. */ - public static Table readParquet(ParquetOptions opts, byte[] buffer, long offset, long len) { + public static Table readParquet(ParquetOptions opts, byte[] buffer, long offset, long len, + HostMemoryAllocator hostMemoryAllocator) { if (len <= 0) { len = buffer.length - offset; } assert len > 0; assert len <= buffer.length - offset; assert offset >= 0 && offset < buffer.length; - try (HostMemoryBuffer newBuf = HostMemoryBuffer.allocate(len)) { + try (HostMemoryBuffer newBuf = hostMemoryAllocator.allocate(len)) { newBuf.setBytes(0, buffer, offset, len); return readParquet(opts, newBuf, 0, len); } } + public static Table readParquet(ParquetOptions opts, byte[] buffer, long offset, long len) { + return readParquet(opts, buffer, offset, len, DefaultHostMemoryAllocator.get()); + } + /** * Read parquet formatted data. * @param opts various parquet parsing options. @@ -1223,19 +1242,26 @@ public static Table readAvro(AvroOptions opts, byte[] buffer) { * @param buffer raw Avro formatted bytes. * @param offset the starting offset into buffer. * @param len the number of bytes to parse. + * @param hostMemoryAllocator allocator for host memory buffers * @return the data parsed as a table on the GPU. */ - public static Table readAvro(AvroOptions opts, byte[] buffer, long offset, long len) { + public static Table readAvro(AvroOptions opts, byte[] buffer, long offset, long len, + HostMemoryAllocator hostMemoryAllocator) { assert offset >= 0 && offset < buffer.length; assert len <= buffer.length - offset; len = len > 0 ? len : buffer.length - offset; - try (HostMemoryBuffer newBuf = HostMemoryBuffer.allocate(len)) { + try (HostMemoryBuffer newBuf = hostMemoryAllocator.allocate(len)) { newBuf.setBytes(0, buffer, offset, len); return readAvro(opts, newBuf, 0, len); } } + public static Table readAvro(AvroOptions opts, byte[] buffer, long offset, long len) { + return readAvro(opts, buffer, offset, len, DefaultHostMemoryAllocator.get()); + } + + /** * Read Avro formatted data. * @param opts various Avro parsing options. @@ -1301,21 +1327,28 @@ public static Table readORC(ORCOptions opts, byte[] buffer) { * @param buffer raw ORC formatted bytes. * @param offset the starting offset into buffer. * @param len the number of bytes to parse. 
+ * @param hostMemoryAllocator allocator for host memory buffers * @return the data parsed as a table on the GPU. */ - public static Table readORC(ORCOptions opts, byte[] buffer, long offset, long len) { + public static Table readORC(ORCOptions opts, byte[] buffer, long offset, long len, + HostMemoryAllocator hostMemoryAllocator) { if (len <= 0) { len = buffer.length - offset; } assert len > 0; assert len <= buffer.length - offset; assert offset >= 0 && offset < buffer.length; - try (HostMemoryBuffer newBuf = HostMemoryBuffer.allocate(len)) { + try (HostMemoryBuffer newBuf = hostMemoryAllocator.allocate(len)) { newBuf.setBytes(0, buffer, offset, len); return readORC(opts, newBuf, 0, len); } } + public static Table readORC(ORCOptions opts, byte[] buffer, long offset, long len) { + return readORC(opts, buffer, offset, len, DefaultHostMemoryAllocator.get()); + } + + /** * Read ORC formatted data. * @param opts various ORC parsing options. @@ -1606,10 +1639,13 @@ public static TableWriter writeArrowIPCChunked(ArrowIPCWriterOptions options, private static class ArrowReaderWrapper implements AutoCloseable { private HostBufferProvider provider; private HostMemoryBuffer buffer; + private final HostMemoryAllocator hostMemoryAllocator; - private ArrowReaderWrapper(HostBufferProvider provider) { + private ArrowReaderWrapper(HostBufferProvider provider, + HostMemoryAllocator hostMemoryAllocator) { this.provider = provider; - buffer = HostMemoryBuffer.allocate(10 * 1024 * 1024, false); + this.hostMemoryAllocator = hostMemoryAllocator; + buffer = this.hostMemoryAllocator.allocate(10 * 1024 * 1024, false); } // Called From JNI @@ -1656,8 +1692,9 @@ private ArrowIPCStreamedTableReader(ArrowIPCOptions options, File inputFile) { this.callback = options.getCallback(); } - private ArrowIPCStreamedTableReader(ArrowIPCOptions options, HostBufferProvider provider) { - this.provider = new ArrowReaderWrapper(provider); + private ArrowIPCStreamedTableReader(ArrowIPCOptions options, HostBufferProvider provider, + HostMemoryAllocator hostMemoryAllocator) { + this.provider = new ArrowReaderWrapper(provider, hostMemoryAllocator); this.handle = readArrowIPCBufferBegin(this.provider); this.callback = options.getCallback(); } @@ -1720,9 +1757,16 @@ public static StreamedTableReader readArrowIPCChunked(File inputFile) { * @param provider what will provide the data being read. * @return a reader. 
*/ + + public static StreamedTableReader readArrowIPCChunked(ArrowIPCOptions options, + HostBufferProvider provider, + HostMemoryAllocator hostMemoryAllocator) { + return new ArrowIPCStreamedTableReader(options, provider, hostMemoryAllocator); + } + public static StreamedTableReader readArrowIPCChunked(ArrowIPCOptions options, HostBufferProvider provider) { - return new ArrowIPCStreamedTableReader(options, provider); + return new ArrowIPCStreamedTableReader(options, provider, DefaultHostMemoryAllocator.get()); } /** diff --git a/java/src/main/java/ai/rapids/cudf/nvcomp/BatchedLZ4Compressor.java b/java/src/main/java/ai/rapids/cudf/nvcomp/BatchedLZ4Compressor.java index 1ab3b97945d..1aa7e5e11a0 100644 --- a/java/src/main/java/ai/rapids/cudf/nvcomp/BatchedLZ4Compressor.java +++ b/java/src/main/java/ai/rapids/cudf/nvcomp/BatchedLZ4Compressor.java @@ -19,16 +19,18 @@ import ai.rapids.cudf.BaseDeviceMemoryBuffer; import ai.rapids.cudf.CloseableArray; import ai.rapids.cudf.Cuda; +import ai.rapids.cudf.DefaultHostMemoryAllocator; import ai.rapids.cudf.DeviceMemoryBuffer; +import ai.rapids.cudf.HostMemoryAllocator; import ai.rapids.cudf.HostMemoryBuffer; import ai.rapids.cudf.MemoryBuffer; import ai.rapids.cudf.NvtxColor; import ai.rapids.cudf.NvtxRange; -import java.util.Arrays; - /** Multi-buffer LZ4 compressor */ public class BatchedLZ4Compressor { + private static final HostMemoryAllocator hostMemoryAllocator = DefaultHostMemoryAllocator.get(); + static final long MAX_CHUNK_SIZE = 16777216; // in bytes // each chunk has a 64-bit integer value as metadata containing the compressed size static final long METADATA_BYTES_PER_CHUNK = 8; @@ -207,7 +209,7 @@ private DeviceMemoryBuffer putAddrsAndSizesOnDevice(long[] inputAddrs, final long outputAddrsOffset = inputAddrs.length * 8L; final long sizesOffset = outputAddrsOffset + inputAddrs.length * 8L; try (NvtxRange range = new NvtxRange("putAddrsAndSizesOnDevice", NvtxColor.YELLOW)) { - try (HostMemoryBuffer hostbuf = HostMemoryBuffer.allocate(totalSize); + try (HostMemoryBuffer hostbuf = hostMemoryAllocator.allocate(totalSize); DeviceMemoryBuffer result = DeviceMemoryBuffer.allocate(totalSize)) { hostbuf.setLongs(0, inputAddrs, 0, inputAddrs.length); hostbuf.setLongs(outputAddrsOffset, outputAddrs, 0, outputAddrs.length); @@ -224,7 +226,7 @@ private DeviceMemoryBuffer putAddrsAndSizesOnDevice(long[] inputAddrs, // Synchronously copy the resulting compressed sizes from device memory to host memory. 
private long[] getOutputChunkSizes(BaseDeviceMemoryBuffer devChunkSizes, Cuda.Stream stream) { try (NvtxRange range = new NvtxRange("getOutputChunkSizes", NvtxColor.YELLOW)) { - try (HostMemoryBuffer hostbuf = HostMemoryBuffer.allocate(devChunkSizes.getLength())) { + try (HostMemoryBuffer hostbuf = hostMemoryAllocator.allocate(devChunkSizes.getLength())) { hostbuf.copyFromDeviceBuffer(devChunkSizes, stream); int numChunks = (int) (devChunkSizes.getLength() / 8); long[] result = new long[numChunks]; diff --git a/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java b/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java index 1062a765800..f6dffc88b92 100644 --- a/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java +++ b/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java @@ -6752,10 +6752,11 @@ void testColumnViewWithNonEmptyNullsIsCleared() { List list1 = Arrays.asList(4, 5, null); List list2 = Arrays.asList(7, 8, 9); List list3 = null; + final HostMemoryAllocator hostMemoryAllocator = DefaultHostMemoryAllocator.get(); try (ColumnVector input = ColumnVectorTest.makeListsColumn(DType.INT32, list0, list1, list2, list3); BaseDeviceMemoryBuffer baseValidityBuffer = input.getDeviceBufferFor(BufferType.VALIDITY); BaseDeviceMemoryBuffer baseOffsetBuffer = input.getDeviceBufferFor(BufferType.OFFSET); - HostMemoryBuffer newValidity = HostMemoryBuffer.allocate(BitVectorHelper.getValidityAllocationSizeInBytes(4))) { + HostMemoryBuffer newValidity = hostMemoryAllocator.allocate(BitVectorHelper.getValidityAllocationSizeInBytes(4))) { newValidity.copyFromDeviceBuffer(baseValidityBuffer); // we are setting list1 with 3 elements to null. This will result in a non-empty null in the diff --git a/java/src/test/java/ai/rapids/cudf/ColumnViewNonEmptyNullsTest.java b/java/src/test/java/ai/rapids/cudf/ColumnViewNonEmptyNullsTest.java index 45e14ef8e26..8d5351d95f4 100644 --- a/java/src/test/java/ai/rapids/cudf/ColumnViewNonEmptyNullsTest.java +++ b/java/src/test/java/ai/rapids/cudf/ColumnViewNonEmptyNullsTest.java @@ -34,6 +34,8 @@ */ public class ColumnViewNonEmptyNullsTest extends CudfTestBase { + private static final HostMemoryAllocator hostMemoryAllocator = DefaultHostMemoryAllocator.get(); + @Test void testAndNullReconfigureNulls() { try (ColumnVector v0 = ColumnVector.fromBoxedInts(0, 100, null, null, Integer.MIN_VALUE, null); @@ -84,7 +86,7 @@ private ColumnView[] getColumnViewWithNonEmptyNulls() { ColumnVector input = ColumnVectorTest.makeListsColumn(DType.INT32, list0, list1, list2, list3); // Modify the validity buffer BaseDeviceMemoryBuffer dmb = input.getDeviceBufferFor(BufferType.VALIDITY); - try (HostMemoryBuffer newValidity = HostMemoryBuffer.allocate(64)) { + try (HostMemoryBuffer newValidity = hostMemoryAllocator.allocate(64)) { newValidity.copyFromDeviceBuffer(dmb); BitVectorHelper.setNullAt(newValidity, 1); dmb.copyFromHostBuffer(newValidity); diff --git a/java/src/test/java/ai/rapids/cudf/CuFileTest.java b/java/src/test/java/ai/rapids/cudf/CuFileTest.java index 10415cae893..8945b6684d5 100644 --- a/java/src/test/java/ai/rapids/cudf/CuFileTest.java +++ b/java/src/test/java/ai/rapids/cudf/CuFileTest.java @@ -27,6 +27,9 @@ import static org.junit.jupiter.api.Assumptions.assumeTrue; public class CuFileTest extends CudfTestBase { + + private static final HostMemoryAllocator hostMemoryAllocator = DefaultHostMemoryAllocator.get(); + @AfterEach void tearDown() { if (PinnedMemoryPool.isInitialized()) { @@ -67,10 +70,10 @@ public void testAppendToExistingFile(@TempDir File tempDir) throws 
IOException { } private void verifyCopyToFile(File tempFile) { - try (HostMemoryBuffer orig = HostMemoryBuffer.allocate(16); + try (HostMemoryBuffer orig = hostMemoryAllocator.allocate(16); DeviceMemoryBuffer from = DeviceMemoryBuffer.allocate(16); DeviceMemoryBuffer to = DeviceMemoryBuffer.allocate(16); - HostMemoryBuffer dest = HostMemoryBuffer.allocate(16)) { + HostMemoryBuffer dest = hostMemoryAllocator.allocate(16)) { orig.setLong(0, 123456789); from.copyFromHostBuffer(orig); CuFile.writeDeviceBufferToFile(tempFile, 0, from); @@ -81,10 +84,10 @@ private void verifyCopyToFile(File tempFile) { } private void verifyAppendToFile(File tempFile) { - try (HostMemoryBuffer orig = HostMemoryBuffer.allocate(16); + try (HostMemoryBuffer orig = hostMemoryAllocator.allocate(16); DeviceMemoryBuffer from = DeviceMemoryBuffer.allocate(16); DeviceMemoryBuffer to = DeviceMemoryBuffer.allocate(16); - HostMemoryBuffer dest = HostMemoryBuffer.allocate(16)) { + HostMemoryBuffer dest = hostMemoryAllocator.allocate(16)) { orig.setLong(0, 123456789); from.copyFromHostBuffer(orig); assertEquals(0, CuFile.appendDeviceBufferToFile(tempFile, from)); @@ -128,7 +131,7 @@ public void testReadWriteRegisteredBuffer(@TempDir File tempDir) { } private void verifyReadWrite(File tempFile, int length, boolean registerBuffer) { - try (HostMemoryBuffer orig = HostMemoryBuffer.allocate(length); + try (HostMemoryBuffer orig = hostMemoryAllocator.allocate(length); CuFileBuffer from = CuFileBuffer.allocate(length, registerBuffer); CuFileWriteHandle writer = new CuFileWriteHandle(tempFile.getAbsolutePath())) { orig.setLong(0, 123456789); @@ -141,7 +144,7 @@ private void verifyReadWrite(File tempFile, int length, boolean registerBuffer) } try (CuFileBuffer to = CuFileBuffer.allocate(length, registerBuffer); CuFileReadHandle reader = new CuFileReadHandle(tempFile.getAbsolutePath()); - HostMemoryBuffer dest = HostMemoryBuffer.allocate(length)) { + HostMemoryBuffer dest = hostMemoryAllocator.allocate(length)) { reader.read(to, 0); dest.copyFromDeviceBuffer(to); assertEquals(123456789, dest.getLong(0)); diff --git a/java/src/test/java/ai/rapids/cudf/GatherMapTest.java b/java/src/test/java/ai/rapids/cudf/GatherMapTest.java index b0e78a2c2cd..8bab049c0af 100644 --- a/java/src/test/java/ai/rapids/cudf/GatherMapTest.java +++ b/java/src/test/java/ai/rapids/cudf/GatherMapTest.java @@ -24,6 +24,8 @@ import static org.junit.jupiter.api.Assertions.assertThrows; public class GatherMapTest { + private static final HostMemoryAllocator hostMemoryAllocator = DefaultHostMemoryAllocator.get(); + @Test void testInvalidBuffer() { try (DeviceMemoryBuffer buffer = DeviceMemoryBuffer.allocate(707)) { @@ -68,7 +70,7 @@ void testInvalidColumnView() { @Test void testToColumnView() { - try (HostMemoryBuffer hostBuffer = HostMemoryBuffer.allocate(8 * 4)) { + try (HostMemoryBuffer hostBuffer = hostMemoryAllocator.allocate(8 * 4)) { hostBuffer.setInts(0, new int[]{10, 11, 12, 13, 14, 15, 16, 17}, 0, 8); try (DeviceMemoryBuffer devBuffer = DeviceMemoryBuffer.allocate(8*4)) { devBuffer.copyFromHostBuffer(hostBuffer); @@ -78,7 +80,7 @@ void testToColumnView() { assertEquals(DType.INT32, view.getType()); assertEquals(0, view.getNullCount()); assertEquals(8, view.getRowCount()); - try (HostMemoryBuffer viewHostBuffer = HostMemoryBuffer.allocate(8 * 4)) { + try (HostMemoryBuffer viewHostBuffer = hostMemoryAllocator.allocate(8 * 4)) { viewHostBuffer.copyFromDeviceBuffer(view.getData()); for (int i = 0; i < 8; i++) { assertEquals(i + 10, viewHostBuffer.getInt(4*i)); @@ 
-88,7 +90,7 @@ void testToColumnView() { assertEquals(DType.INT32, view.getType()); assertEquals(0, view.getNullCount()); assertEquals(2, view.getRowCount()); - try (HostMemoryBuffer viewHostBuffer = HostMemoryBuffer.allocate(8)) { + try (HostMemoryBuffer viewHostBuffer = hostMemoryAllocator.allocate(8)) { viewHostBuffer.copyFromDeviceBuffer(view.getData()); assertEquals(13, viewHostBuffer.getInt(0)); assertEquals(14, viewHostBuffer.getInt(4)); diff --git a/java/src/test/java/ai/rapids/cudf/MemoryBufferTest.java b/java/src/test/java/ai/rapids/cudf/MemoryBufferTest.java index c332ce660d1..ec36b4f82b0 100644 --- a/java/src/test/java/ai/rapids/cudf/MemoryBufferTest.java +++ b/java/src/test/java/ai/rapids/cudf/MemoryBufferTest.java @@ -25,13 +25,15 @@ import static org.junit.jupiter.api.Assertions.*; public class MemoryBufferTest extends CudfTestBase { + private static final HostMemoryAllocator hostMemoryAllocator = DefaultHostMemoryAllocator.get(); + private static final byte[] BYTES = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}; private static final byte[] EXPECTED = {0, 2, 3, 4, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}; @Test public void testAddressOutOfBoundsExceptionWhenCopying() { - try (HostMemoryBuffer from = HostMemoryBuffer.allocate(16); - HostMemoryBuffer to = HostMemoryBuffer.allocate(16)) { + try (HostMemoryBuffer from = hostMemoryAllocator.allocate(16); + HostMemoryBuffer to = hostMemoryAllocator.allocate(16)) { assertThrows(AssertionError.class, () -> to.copyFromMemoryBuffer(-1, from, 0, 16, Cuda.DEFAULT_STREAM)); assertThrows(AssertionError.class, () -> to.copyFromMemoryBuffer(16, from, 0, 16, Cuda.DEFAULT_STREAM)); assertThrows(AssertionError.class, () -> to.copyFromMemoryBuffer(0, from, -1, 16, Cuda.DEFAULT_STREAM)); @@ -45,8 +47,8 @@ public void testAddressOutOfBoundsExceptionWhenCopying() { @Test public void testAddressOutOfBoundsExceptionWhenCopyingAsync() { - try (HostMemoryBuffer from = HostMemoryBuffer.allocate(16); - HostMemoryBuffer to = HostMemoryBuffer.allocate(16)) { + try (HostMemoryBuffer from = hostMemoryAllocator.allocate(16); + HostMemoryBuffer to = hostMemoryAllocator.allocate(16)) { assertThrows(AssertionError.class, () -> to.copyFromMemoryBufferAsync(-1, from, 0, 16, Cuda.DEFAULT_STREAM)); assertThrows(AssertionError.class, () -> to.copyFromMemoryBufferAsync(16, from, 0, 16, Cuda.DEFAULT_STREAM)); assertThrows(AssertionError.class, () -> to.copyFromMemoryBufferAsync(0, from, -1, 16, Cuda.DEFAULT_STREAM)); @@ -60,10 +62,10 @@ public void testAddressOutOfBoundsExceptionWhenCopyingAsync() { @Test public void testCopyingFromDeviceToDevice() { - try (HostMemoryBuffer in = HostMemoryBuffer.allocate(16); + try (HostMemoryBuffer in = hostMemoryAllocator.allocate(16); DeviceMemoryBuffer from = DeviceMemoryBuffer.allocate(16); DeviceMemoryBuffer to = DeviceMemoryBuffer.allocate(16); - HostMemoryBuffer out = HostMemoryBuffer.allocate(16)) { + HostMemoryBuffer out = hostMemoryAllocator.allocate(16)) { in.setBytes(0, BYTES, 0, 16); from.copyFromHostBuffer(in); to.copyFromMemoryBuffer(0, from, 0, 16, Cuda.DEFAULT_STREAM); @@ -75,10 +77,10 @@ public void testCopyingFromDeviceToDevice() { @Test public void testCopyingFromDeviceToDeviceAsync() { - try (HostMemoryBuffer in = HostMemoryBuffer.allocate(16); + try (HostMemoryBuffer in = hostMemoryAllocator.allocate(16); DeviceMemoryBuffer from = DeviceMemoryBuffer.allocate(16); DeviceMemoryBuffer to = DeviceMemoryBuffer.allocate(16); - HostMemoryBuffer out = HostMemoryBuffer.allocate(16)) { + HostMemoryBuffer out = 
hostMemoryAllocator.allocate(16)) { in.setBytes(0, BYTES, 0, 16); from.copyFromHostBuffer(in); to.copyFromMemoryBufferAsync(0, from, 0, 16, Cuda.DEFAULT_STREAM); @@ -91,8 +93,8 @@ public void testCopyingFromDeviceToDeviceAsync() { @Test public void testCopyingFromHostToHost() { - try (HostMemoryBuffer from = HostMemoryBuffer.allocate(16); - HostMemoryBuffer to = HostMemoryBuffer.allocate(16)) { + try (HostMemoryBuffer from = hostMemoryAllocator.allocate(16); + HostMemoryBuffer to = hostMemoryAllocator.allocate(16)) { from.setBytes(0, BYTES, 0, 16); to.setBytes(0, BYTES, 0, 16); to.copyFromMemoryBuffer(1, from, 2, 3, Cuda.DEFAULT_STREAM); @@ -102,8 +104,8 @@ public void testCopyingFromHostToHost() { @Test public void testCopyingFromHostToHostAsync() { - try (HostMemoryBuffer from = HostMemoryBuffer.allocate(16); - HostMemoryBuffer to = HostMemoryBuffer.allocate(16)) { + try (HostMemoryBuffer from = hostMemoryAllocator.allocate(16); + HostMemoryBuffer to = hostMemoryAllocator.allocate(16)) { from.setBytes(0, BYTES, 0, 16); to.setBytes(0, BYTES, 0, 16); to.copyFromMemoryBufferAsync(1, from, 2, 3, Cuda.DEFAULT_STREAM); @@ -113,9 +115,9 @@ public void testCopyingFromHostToHostAsync() { @Test public void testCopyingFromHostToDevice() { - try (HostMemoryBuffer from = HostMemoryBuffer.allocate(16); + try (HostMemoryBuffer from = hostMemoryAllocator.allocate(16); DeviceMemoryBuffer to = DeviceMemoryBuffer.allocate(16); - HostMemoryBuffer out = HostMemoryBuffer.allocate(16)) { + HostMemoryBuffer out = hostMemoryAllocator.allocate(16)) { from.setBytes(0, BYTES, 0, 16); to.copyFromMemoryBuffer(0, from, 0, 16, Cuda.DEFAULT_STREAM); to.copyFromMemoryBufferAsync(1, from, 2, 3, Cuda.DEFAULT_STREAM); @@ -126,9 +128,9 @@ public void testCopyingFromHostToDevice() { @Test public void testCopyingFromHostToDeviceAsync() { - try (HostMemoryBuffer from = HostMemoryBuffer.allocate(16); + try (HostMemoryBuffer from = hostMemoryAllocator.allocate(16); DeviceMemoryBuffer to = DeviceMemoryBuffer.allocate(16); - HostMemoryBuffer out = HostMemoryBuffer.allocate(16)) { + HostMemoryBuffer out = hostMemoryAllocator.allocate(16)) { from.setBytes(0, BYTES, 0, 16); to.copyFromMemoryBufferAsync(0, from, 0, 16, Cuda.DEFAULT_STREAM); to.copyFromMemoryBufferAsync(1, from, 2, 3, Cuda.DEFAULT_STREAM); @@ -140,9 +142,9 @@ public void testCopyingFromHostToDeviceAsync() { @Test public void testCopyingFromDeviceToHost() { - try (HostMemoryBuffer in = HostMemoryBuffer.allocate(16); + try (HostMemoryBuffer in = hostMemoryAllocator.allocate(16); DeviceMemoryBuffer from = DeviceMemoryBuffer.allocate(16); - HostMemoryBuffer to = HostMemoryBuffer.allocate(16)) { + HostMemoryBuffer to = hostMemoryAllocator.allocate(16)) { in.setBytes(0, BYTES, 0, 16); from.copyFromHostBuffer(in); to.setBytes(0, BYTES, 0, 16); @@ -153,9 +155,9 @@ public void testCopyingFromDeviceToHost() { @Test public void testCopyingFromDeviceToHostAsync() { - try (HostMemoryBuffer in = HostMemoryBuffer.allocate(16); + try (HostMemoryBuffer in = hostMemoryAllocator.allocate(16); DeviceMemoryBuffer from = DeviceMemoryBuffer.allocate(16); - HostMemoryBuffer to = HostMemoryBuffer.allocate(16)) { + HostMemoryBuffer to = hostMemoryAllocator.allocate(16)) { in.setBytes(0, BYTES, 0, 16); from.copyFromHostBuffer(in); to.setBytes(0, BYTES, 0, 16); diff --git a/java/src/test/java/ai/rapids/cudf/TableTest.java b/java/src/test/java/ai/rapids/cudf/TableTest.java index f17197ef608..5c0c738a20f 100644 --- a/java/src/test/java/ai/rapids/cudf/TableTest.java +++ 
b/java/src/test/java/ai/rapids/cudf/TableTest.java @@ -75,6 +75,8 @@ import static org.junit.jupiter.api.Assertions.assertTrue; public class TableTest extends CudfTestBase { + private static final HostMemoryAllocator hostMemoryAllocator = DefaultHostMemoryAllocator.get(); + private static final File TEST_PARQUET_FILE = TestUtils.getResourceAsFile("acq.parquet"); private static final File TEST_PARQUET_FILE_CHUNKED_READ = TestUtils.getResourceAsFile("splittable.parquet"); private static final File TEST_PARQUET_FILE_BINARY = TestUtils.getResourceAsFile("binary.parquet"); @@ -440,7 +442,7 @@ void testReadJSONTableWithMeta() { "{ \"A\": 3, \"B\": 6, \"C\": \"Z\"}\n" + "{ \"A\": 4, \"B\": 8, \"C\": \"W\"}\n").getBytes(StandardCharsets.UTF_8); final int numBytes = data.length; - try (HostMemoryBuffer hostbuf = HostMemoryBuffer.allocate(numBytes)) { + try (HostMemoryBuffer hostbuf = hostMemoryAllocator.allocate(numBytes)) { hostbuf.setBytes(0, data, 0, numBytes); try (Table expected = new Table.TestBuilder() .column(1L, 2L, 3L, 4L) @@ -3465,7 +3467,7 @@ void testSerializationRoundTripConcatOnHostEmpty() throws IOException { do { head = new JCudfSerialization.SerializedTableHeader(din); if (head.wasInitialized()) { - HostMemoryBuffer buff = HostMemoryBuffer.allocate(head.getDataLen()); + HostMemoryBuffer buff = hostMemoryAllocator.allocate(head.getDataLen()); buffers.add(buff); JCudfSerialization.readTableIntoBuffer(din, head, buff); assert head.wasDataRead(); @@ -3624,7 +3626,7 @@ void testSerializationRoundTripConcatHostSide() throws IOException { do { head = new JCudfSerialization.SerializedTableHeader(din); if (head.wasInitialized()) { - HostMemoryBuffer buff = HostMemoryBuffer.allocate(100 * 1024); + HostMemoryBuffer buff = hostMemoryAllocator.allocate(100 * 1024); buffers.add(buff); JCudfSerialization.readTableIntoBuffer(din, head, buff); assert head.wasDataRead(); @@ -3665,7 +3667,7 @@ private void testSerializationRoundTripToHost(Table t) throws IOException { JCudfSerialization.SerializedTableHeader header = new JCudfSerialization.SerializedTableHeader(din); assertTrue(header.wasInitialized()); - try (HostMemoryBuffer buffer = HostMemoryBuffer.allocate(header.getDataLen())) { + try (HostMemoryBuffer buffer = hostMemoryAllocator.allocate(header.getDataLen())) { JCudfSerialization.readTableIntoBuffer(din, header, buffer); assertTrue(header.wasDataRead()); HostColumnVector[] hostColumns = @@ -3727,7 +3729,7 @@ void testConcatHost() throws IOException { DataInputStream in = new DataInputStream(new ByteArrayInputStream(out.toByteArray())); JCudfSerialization.SerializedTableHeader header = new JCudfSerialization.SerializedTableHeader(in); assert header.wasInitialized(); - try (HostMemoryBuffer buff = HostMemoryBuffer.allocate(header.getDataLen())) { + try (HostMemoryBuffer buff = hostMemoryAllocator.allocate(header.getDataLen())) { JCudfSerialization.readTableIntoBuffer(in, header, buff); assert header.wasDataRead(); try (Table result = JCudfSerialization.readAndConcat( @@ -3758,7 +3760,7 @@ void testSerializationRoundTripSlicedHostSide() throws IOException { do { head = new JCudfSerialization.SerializedTableHeader(din); if (head.wasInitialized()) { - HostMemoryBuffer buff = HostMemoryBuffer.allocate(100 * 1024); + HostMemoryBuffer buff = hostMemoryAllocator.allocate(100 * 1024); buffers.add(buff); JCudfSerialization.readTableIntoBuffer(din, head, buff); assert head.wasDataRead(); @@ -7985,7 +7987,7 @@ private final class MyBufferConsumer implements HostBufferConsumer, AutoCloseabl long offset = 
0; public MyBufferConsumer() { - buffer = HostMemoryBuffer.allocate(10 * 1024 * 1024); + buffer = hostMemoryAllocator.allocate(10 * 1024 * 1024); } @Override diff --git a/java/src/test/java/ai/rapids/cudf/nvcomp/NvcompTest.java b/java/src/test/java/ai/rapids/cudf/nvcomp/NvcompTest.java index ec14a1cfee6..66f4fe39109 100644 --- a/java/src/test/java/ai/rapids/cudf/nvcomp/NvcompTest.java +++ b/java/src/test/java/ai/rapids/cudf/nvcomp/NvcompTest.java @@ -26,6 +26,8 @@ import java.util.Optional; public class NvcompTest { + private static final HostMemoryAllocator hostMemoryAllocator = DefaultHostMemoryAllocator.get(); + private static final Logger log = LoggerFactory.getLogger(ColumnVector.class); @Test @@ -68,9 +70,9 @@ void testBatchedLZ4RoundTripAsync() { // check the decompressed results against the original for (int i = 0; i < numBuffers; ++i) { try (HostMemoryBuffer expected = - HostMemoryBuffer.allocate(originalBuffers.get(i).getLength()); + hostMemoryAllocator.allocate(originalBuffers.get(i).getLength()); HostMemoryBuffer actual = - HostMemoryBuffer.allocate(uncompressedBuffers.get(i).getLength())) { + hostMemoryAllocator.allocate(uncompressedBuffers.get(i).getLength())) { Assertions.assertTrue(expected.getLength() <= Integer.MAX_VALUE); Assertions.assertTrue(actual.getLength() <= Integer.MAX_VALUE); Assertions.assertEquals(expected.getLength(), actual.getLength(), @@ -114,7 +116,7 @@ private DeviceMemoryBuffer initBatchBuffer(long[] data, int bufferId) { } long[] bufferData = Arrays.copyOfRange(data, dataStart, dataStart + dataLength + 1); DeviceMemoryBuffer devBuffer = null; - try (HostMemoryBuffer hmb = HostMemoryBuffer.allocate(bufferData.length * 8)) { + try (HostMemoryBuffer hmb = hostMemoryAllocator.allocate(bufferData.length * 8)) { hmb.setLongs(0, bufferData, 0, bufferData.length); devBuffer = DeviceMemoryBuffer.allocate(hmb.getLength()); devBuffer.copyFromHostBuffer(hmb); From 171fc91a6e67e50cce8391457f92729044ddc86b Mon Sep 17 00:00:00 2001 From: GALI PREM SAGAR Date: Wed, 23 Aug 2023 18:07:38 -0500 Subject: [PATCH 091/230] Fix `index` of `Groupby.apply` results when it is performed on empty objects (#13944) closes #13939 This PR fixes two issues with `Groupby.apply`, where the index of the result was not being set correctly and there is a corner case for `bool` dtype that has to be handled for `sum` & `product` operations. 
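A minimal sketch of the behavior this targets, mirroring the new test added below (the frame and values here are illustrative only):

```python
import cudf

# Illustrative empty frame with boolean columns.
gdf = cudf.DataFrame({"a": [], "b": [], "c": []}, dtype="bool")
pdf = gdf.to_pandas()

# The result of `apply` on an empty object now carries the group keys ("a")
# as its index, and "sum"/"product" on boolean columns produce int64,
# matching pandas.
got = gdf.groupby("a")["c"].apply("sum")
expected = pdf.groupby("a")["c"].apply("sum")

print(got.dtype, expected.dtype)  # both empty, with matching index and dtype
```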
Authors: - GALI PREM SAGAR (https://github.com/galipremsagar) Approvers: - Bradley Dice (https://github.com/bdice) URL: https://github.com/rapidsai/cudf/pull/13944 --- python/cudf/cudf/core/groupby/groupby.py | 13 +++++++++-- python/cudf/cudf/tests/test_groupby.py | 29 ++++++++++++++++++++++++ 2 files changed, 40 insertions(+), 2 deletions(-) diff --git a/python/cudf/cudf/core/groupby/groupby.py b/python/cudf/cudf/core/groupby/groupby.py index b0be97915f2..e97ea8081e8 100644 --- a/python/cudf/cudf/core/groupby/groupby.py +++ b/python/cudf/cudf/core/groupby/groupby.py @@ -21,7 +21,7 @@ from cudf._lib.sort import segmented_sort_by_key from cudf._lib.types import size_type_dtype from cudf._typing import AggType, DataFrameOrSeries, MultiColumnAggType -from cudf.api.types import is_list_like +from cudf.api.types import is_bool_dtype, is_list_like from cudf.core.abc import Serializable from cudf.core.column.column import ColumnBase, arange, as_column from cudf.core.column_accessor import ColumnAccessor @@ -1373,7 +1373,16 @@ def mult(df): """ if self.obj.empty: - return self.obj + res = self.obj.copy(deep=True) + res.index = self.grouping.keys + if function in {"sum", "product"}: + # For `sum` & `product`, boolean types + # will need to result in `int64` type. + for name, col in res._data.items(): + if is_bool_dtype(col.dtype): + res._data[name] = col.astype("int") + return res + if not callable(function): raise TypeError(f"type {type(function)} is not callable") group_names, offsets, group_keys, grouped_values = self._grouped() diff --git a/python/cudf/cudf/tests/test_groupby.py b/python/cudf/cudf/tests/test_groupby.py index a3b205cc16b..0e96b97e1e1 100644 --- a/python/cudf/cudf/tests/test_groupby.py +++ b/python/cudf/cudf/tests/test_groupby.py @@ -3342,6 +3342,35 @@ def test_group_by_pandas_sort_order(groups, sort): ) +@pytest.mark.parametrize( + "dtype", + ["int32", "int64", "float64", "datetime64[ns]", "timedelta64[ns]", "bool"], +) +@pytest.mark.parametrize( + "apply_op", + ["sum", "min", "max", "idxmax"], +) +def test_group_by_empty_apply(request, dtype, apply_op): + request.applymarker( + pytest.mark.xfail( + condition=(dtype == "datetime64[ns]" and apply_op == "sum"), + reason=("sum isn't supported for datetime64[ns]"), + ) + ) + gdf = cudf.DataFrame({"a": [], "b": [], "c": []}, dtype=dtype) + pdf = gdf.to_pandas() + + gg = gdf.groupby("a")["c"] + pg = pdf.groupby("a")["c"] + + assert_eq( + gg.apply(apply_op), + pg.apply(apply_op), + check_dtype=True, + check_index_type=True, + ) + + def test_groupby_consecutive_operations(): df = cudf.DataFrame([[1, np.nan], [1, 4], [5, 6]], columns=["A", "B"]) pdf = df.to_pandas() From 6ed42d7c26ab19f86bb31e88eec7e752359a90d5 Mon Sep 17 00:00:00 2001 From: GALI PREM SAGAR Date: Wed, 23 Aug 2023 20:32:54 -0500 Subject: [PATCH 092/230] Fix type mismatch in groupby reduction for empty objects (#13942) closes #13941 This PR preserves the column types, for group by reduction operations that are performed on empty objects. 
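For example (a sketch based on the new test; the data and dtype are illustrative):

```python
import cudf

# Illustrative empty frame with a non-default dtype.
gdf = cudf.DataFrame({"a": [], "b": [], "c": []}, dtype="datetime64[ns]")
pdf = gdf.to_pandas()

got = gdf.groupby("a")["c"].min()
expected = pdf.groupby("a")["c"].min()

# With this change the empty result keeps the original column dtype
# (datetime64[ns]) instead of falling back to a default type.
print(got.dtype, expected.dtype)
```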
Authors: - GALI PREM SAGAR (https://github.com/galipremsagar) Approvers: - Bradley Dice (https://github.com/bdice) URL: https://github.com/rapidsai/cudf/pull/13942 --- python/cudf/cudf/core/groupby/groupby.py | 23 +++++++++++++++++-- python/cudf/cudf/tests/test_groupby.py | 28 ++++++++++++++++++++++++ 2 files changed, 49 insertions(+), 2 deletions(-) diff --git a/python/cudf/cudf/core/groupby/groupby.py b/python/cudf/cudf/core/groupby/groupby.py index e97ea8081e8..63c9dd837a8 100644 --- a/python/cudf/cudf/core/groupby/groupby.py +++ b/python/cudf/cudf/core/groupby/groupby.py @@ -553,8 +553,8 @@ def agg(self, func): orig_dtypes, ): for agg, col in zip(aggs, cols): + agg_name = agg.__name__ if callable(agg) else agg if multilevel: - agg_name = agg.__name__ if callable(agg) else agg key = (col_name, agg_name) else: key = col_name @@ -564,7 +564,26 @@ def agg(self, func): ): # Structs lose their labels which we reconstruct here col = col._with_type_metadata(cudf.ListDtype(orig_dtype)) - data[key] = col + + if ( + self.obj.empty + and ( + isinstance(agg_name, str) + and agg_name in Reducible._SUPPORTED_REDUCTIONS + ) + and len(col) == 0 + and not isinstance( + col, + ( + cudf.core.column.ListColumn, + cudf.core.column.StructColumn, + cudf.core.column.DecimalBaseColumn, + ), + ) + ): + data[key] = col.astype(orig_dtype) + else: + data[key] = col data = ColumnAccessor(data, multiindex=multilevel) if not multilevel: data = data.rename_levels({np.nan: None}, level=0) diff --git a/python/cudf/cudf/tests/test_groupby.py b/python/cudf/cudf/tests/test_groupby.py index 0e96b97e1e1..784cabaa542 100644 --- a/python/cudf/cudf/tests/test_groupby.py +++ b/python/cudf/cudf/tests/test_groupby.py @@ -3342,6 +3342,33 @@ def test_group_by_pandas_sort_order(groups, sort): ) +@pytest.mark.parametrize( + "dtype", + ["int32", "int64", "float64", "datetime64[ns]", "timedelta64[ns]", "bool"], +) +@pytest.mark.parametrize( + "reduce_op", + [ + "min", + "max", + "idxmin", + "idxmax", + "first", + "last", + ], +) +def test_group_by_empty_reduction(dtype, reduce_op): + gdf = cudf.DataFrame({"a": [], "b": [], "c": []}, dtype=dtype) + pdf = gdf.to_pandas() + + gg = gdf.groupby("a")["c"] + pg = pdf.groupby("a")["c"] + + assert_eq( + getattr(gg, reduce_op)(), getattr(pg, reduce_op)(), check_dtype=True + ) + + @pytest.mark.parametrize( "dtype", ["int32", "int64", "float64", "datetime64[ns]", "timedelta64[ns]", "bool"], @@ -3357,6 +3384,7 @@ def test_group_by_empty_apply(request, dtype, apply_op): reason=("sum isn't supported for datetime64[ns]"), ) ) + gdf = cudf.DataFrame({"a": [], "b": [], "c": []}, dtype=dtype) pdf = gdf.to_pandas() From 83f9cbfbe629680f5e7c0de679bf94eb3f971159 Mon Sep 17 00:00:00 2001 From: GALI PREM SAGAR Date: Wed, 23 Aug 2023 20:33:44 -0500 Subject: [PATCH 093/230] Raise error for string types in `nsmallest` and `nlargest` (#13946) closes #13945 This PR contains changes that raises an error message exactly matching pandas for `nsmallest` and `nlargest`. 
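For example (a sketch using made-up data):

```python
import cudf

gdf = cudf.DataFrame({"a": [1, 2, 3, 4], "b": ["a", "b", "c", "d"]})

try:
    gdf.nlargest(n=1, columns=["a", "b"])
except TypeError as err:
    # e.g. "Column 'b' has dtype object, cannot use method 'nlargest' with
    # this dtype" -- the same message pandas raises for the equivalent call.
    print(err)
```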
Authors: - GALI PREM SAGAR (https://github.com/galipremsagar) Approvers: - Bradley Dice (https://github.com/bdice) URL: https://github.com/rapidsai/cudf/pull/13946 --- python/cudf/cudf/core/indexed_frame.py | 14 ++++++++++++++ python/cudf/cudf/tests/test_dataframe.py | 13 +++++++++++++ python/cudf/cudf/tests/test_series.py | 10 ++++++++++ 3 files changed, 37 insertions(+) diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py index 8e6cdbb2787..4c6eb3a50e9 100644 --- a/python/cudf/cudf/core/indexed_frame.py +++ b/python/cudf/cudf/core/indexed_frame.py @@ -2465,6 +2465,20 @@ def _n_largest_or_smallest(self, largest, n, columns, keep): if isinstance(columns, str): columns = [columns] + method = "nlargest" if largest else "nsmallest" + for col in columns: + if isinstance(self._data[col], cudf.core.column.StringColumn): + if isinstance(self, cudf.DataFrame): + error_msg = ( + f"Column '{col}' has dtype {self._data[col].dtype}, " + f"cannot use method '{method}' with this dtype" + ) + else: + error_msg = ( + f"Cannot use method '{method}' with " + f"dtype {self._data[col].dtype}" + ) + raise TypeError(error_msg) if len(self) == 0: return self diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py index 0501874ecda..3c84cfe48c4 100644 --- a/python/cudf/cudf/tests/test_dataframe.py +++ b/python/cudf/cudf/tests/test_dataframe.py @@ -10316,3 +10316,16 @@ def test_dataframe_reindex_with_index_names(index_data, name): expected = pdf.reindex(index_data) assert_eq(actual, expected) + + +@pytest.mark.parametrize("attr", ["nlargest", "nsmallest"]) +def test_dataframe_nlargest_nsmallest_str_error(attr): + gdf = cudf.DataFrame({"a": [1, 2, 3, 4], "b": ["a", "b", "c", "d"]}) + pdf = gdf.to_pandas() + + assert_exceptions_equal( + getattr(gdf, attr), + getattr(pdf, attr), + ([], {"n": 1, "columns": ["a", "b"]}), + ([], {"n": 1, "columns": ["a", "b"]}), + ) diff --git a/python/cudf/cudf/tests/test_series.py b/python/cudf/cudf/tests/test_series.py index 2c4befb8393..db1249213f8 100644 --- a/python/cudf/cudf/tests/test_series.py +++ b/python/cudf/cudf/tests/test_series.py @@ -2244,3 +2244,13 @@ def test_series_typecast_to_object(): assert new_series[0] == "1970-01-01 00:00:00.000000001" new_series = actual.astype(np.dtype("object")) assert new_series[0] == "1970-01-01 00:00:00.000000001" + + +@pytest.mark.parametrize("attr", ["nlargest", "nsmallest"]) +def test_series_nlargest_nsmallest_str_error(attr): + gs = cudf.Series(["a", "b", "c", "d", "e"]) + ps = gs.to_pandas() + + assert_exceptions_equal( + getattr(gs, attr), getattr(ps, attr), ([], {"n": 1}), ([], {"n": 1}) + ) From f70f2cd415a1d8b2af2f0343fc2003101691f5c4 Mon Sep 17 00:00:00 2001 From: "Mads R. B. Kristensen" Date: Thu, 24 Aug 2023 11:06:13 +0200 Subject: [PATCH 094/230] Enable hugepage for arrow host allocations (#13914) This PR enables Transparent Huge Pages (THP) for large (>4MB) arrow allocations (host memory only). ### Performance results on a DGX-1 (`dgx14`) | | 8MB | 80MB | 800MB | 8GB | Method | |:--------------:|:-----:|:-----:|:-----:|:-----:|------------------------------------| | cudf-native | 0.006 | 0.049 | 0.485 | 4.787 | `df.to_arrow()` (branch-23.10) | | Dask-serialize | 0.004 | 0.032 | 0.310 | 3.122 | `distributed.protocol.serialize(df)` | | cudf-hugepage | 0.004 | 0.030 | 0.299 | 3.046 | `df.to_arrow()` (this PR) | | speedup | 1.5 | 1.63 | 1.62 | 1.57 | cudf-native vs. 
cudf-hugepage | Notice, Dask-serialize also use THP, which is why its performance is on par with cudf-hugepage. Authors: - Mads R. B. Kristensen (https://github.com/madsbk) Approvers: - Mark Harris (https://github.com/harrism) - Vukasin Milovanovic (https://github.com/vuule) URL: https://github.com/rapidsai/cudf/pull/13914 --- cpp/src/interop/detail/arrow_allocator.cpp | 39 +++++++++++++++++++--- 1 file changed, 35 insertions(+), 4 deletions(-) diff --git a/cpp/src/interop/detail/arrow_allocator.cpp b/cpp/src/interop/detail/arrow_allocator.cpp index d802588adb5..41fb68a5748 100644 --- a/cpp/src/interop/detail/arrow_allocator.cpp +++ b/cpp/src/interop/detail/arrow_allocator.cpp @@ -16,9 +16,40 @@ #include +#include +#include +#include + namespace cudf { namespace detail { +/* + Enable Transparent Huge Pages (THP) for large (>4MB) allocations. + `buf` is returned untouched. + Enabling THP can improve performance of device-host memory transfers + significantly, see . +*/ +template +T enable_hugepage(T&& buf) +{ + if (buf->size() < (1u << 22u)) { // Smaller than 4 MB + return std::move(buf); + } + +#ifdef MADV_HUGEPAGE + const auto pagesize = sysconf(_SC_PAGESIZE); + void* addr = const_cast(buf->data()); + if (addr == nullptr) { return std::move(buf); } + auto length{static_cast(buf->size())}; + if (std::align(pagesize, pagesize, addr, length)) { + // Intentionally not checking for errors that may be returned by older kernel versions; + // optimistically tries enabling huge pages. + madvise(addr, length, MADV_HUGEPAGE); + } +#endif + return std::move(buf); +} + std::unique_ptr allocate_arrow_buffer(int64_t const size, arrow::MemoryPool* ar_mr) { /* @@ -28,9 +59,9 @@ std::unique_ptr allocate_arrow_buffer(int64_t const size, arrow:: To work around this issue we compile an allocation shim in C++ and use that from our cuda sources */ - auto result = arrow::AllocateBuffer(size, ar_mr); + arrow::Result> result = arrow::AllocateBuffer(size, ar_mr); CUDF_EXPECTS(result.ok(), "Failed to allocate Arrow buffer"); - return std::move(result).ValueOrDie(); + return enable_hugepage(std::move(result).ValueOrDie()); } std::shared_ptr allocate_arrow_bitmap(int64_t const size, arrow::MemoryPool* ar_mr) @@ -42,9 +73,9 @@ std::shared_ptr allocate_arrow_bitmap(int64_t const size, arrow:: To work around this issue we compile an allocation shim in C++ and use that from our cuda sources */ - auto result = arrow::AllocateBitmap(size, ar_mr); + arrow::Result> result = arrow::AllocateBitmap(size, ar_mr); CUDF_EXPECTS(result.ok(), "Failed to allocate Arrow bitmap"); - return std::move(result).ValueOrDie(); + return enable_hugepage(std::move(result).ValueOrDie()); } } // namespace detail From d497dd895f21c921c3551196b904b018f58e1e78 Mon Sep 17 00:00:00 2001 From: Michael Wang Date: Thu, 24 Aug 2023 22:02:11 +0800 Subject: [PATCH 095/230] Rewrite `DataFrame.stack` to support multi level column names (#13927) This PR rewrites `DataFrame.stack()`. Adding support to stacking multiple levels in the dataframe. User can now specify one or more levels from the column names to stack. Example: ```python >>> multicol1 = pd.MultiIndex.from_tuples([('weight', 'kg'), ... ('weight', 'pounds')]) >>> df_multi_level_cols1 = cudf.DataFrame([[1, 2], [2, 4]], ... index=['cat', 'dog'], ... 
columns=multicol1) >>> df_multi_level_cols1.stack(0) kg pounds cat weight 1 2 dog weight 2 4 >>> df_multi_level_cols1.stack([0, 1]) cat weight kg 1 pounds 2 dog weight kg 2 pounds 4 dtype: int64 ``` The implementation heavily uses pandas index methods on the column axis. This assumes that the width of the cudf column is limited. The combination of `len(level) > 1 and dropna=False` is currently unsupported. The corresponding behavior in pandas is due to be deprecated in 3.0. See pandas-dev/pandas#53515. closes #13739 Authors: - Michael Wang (https://github.com/isVoid) Approvers: - Lawrence Mitchell (https://github.com/wence-) - Matthew Roeschke (https://github.com/mroeschke) URL: https://github.com/rapidsai/cudf/pull/13927 --- python/cudf/cudf/core/dataframe.py | 308 +++++++++++++++++++++---- python/cudf/cudf/tests/test_reshape.py | 95 ++++++++ 2 files changed, 363 insertions(+), 40 deletions(-) diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index c80c2a7272e..3f89f78d278 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -6441,40 +6441,210 @@ def to_orc( def stack(self, level=-1, dropna=True): """Stack the prescribed level(s) from columns to index - Return a reshaped Series + Return a reshaped DataFrame or Series having a multi-level + index with one or more new inner-most levels compared to + the current DataFrame. The new inner-most levels are created + by pivoting the columns of the current dataframe: + + - if the columns have a single level, the output is a Series; + - if the columns have multiple levels, the new index + level(s) is (are) taken from the prescribed level(s) and + the output is a DataFrame. Parameters ---------- + level : int, str, list default -1 + Level(s) to stack from the column axis onto the index axis, + defined as one index or label, or a list of indices or labels. dropna : bool, default True - Whether to drop rows in the resulting Series with missing values. + Whether to drop rows in the resulting Frame/Series with missing + values. When multiple levels are specified, `dropna==False` is + unsupported. Returns ------- - The stacked cudf.Series + DataFrame or Series + Stacked dataframe or series. + + See Also + -------- + DataFrame.unstack : Unstack prescribed level(s) from index axis + onto column axis. + DataFrame.pivot : Reshape dataframe from long format to wide + format. + DataFrame.pivot_table : Create a spreadsheet-style pivot table + as a DataFrame. + + Notes + ----- + The function is named by analogy with a collection of books + being reorganized from being side by side on a horizontal + position (the columns of the dataframe) to being stacked + vertically on top of each other (in the index of the + dataframe). Examples -------- - >>> import cudf - >>> df = cudf.DataFrame({'a': [0, 1, 3], 'b': [1, 2, 4]}) - >>> df.stack() - 0 a 0 - b 1 - 1 a 1 - b 2 - 2 a 3 - b 4 + **Single level columns** + + >>> df_single_level_cols = cudf.DataFrame([[0, 1], [2, 3]], + ... index=['cat', 'dog'], + ... columns=['weight', 'height']) + + Stacking a dataframe with a single level column axis returns a Series: + + >>> df_single_level_cols + weight height + cat 0 1 + dog 2 3 + >>> df_single_level_cols.stack() + cat height 1 + weight 0 + dog height 3 + weight 2 dtype: int64 + + **Multi level columns: simple case** + + >>> import pandas as pd + >>> multicol1 = pd.MultiIndex.from_tuples([('weight', 'kg'), + ... ('weight', 'pounds')]) + >>> df_multi_level_cols1 = cudf.DataFrame([[1, 2], [2, 4]], + ... 
index=['cat', 'dog'], + ... columns=multicol1) + + Stacking a dataframe with a multi-level column axis: + + >>> df_multi_level_cols1 + weight + kg pounds + cat 1 2 + dog 2 4 + >>> df_multi_level_cols1.stack() + weight + cat kg 1 + pounds 2 + dog kg 2 + pounds 4 + + **Missing values** + + >>> multicol2 = pd.MultiIndex.from_tuples([('weight', 'kg'), + ... ('height', 'm')]) + >>> df_multi_level_cols2 = cudf.DataFrame([[1.0, 2.0], [3.0, 4.0]], + ... index=['cat', 'dog'], + ... columns=multicol2) + + It is common to have missing values when stacking a dataframe + with multi-level columns, as the stacked dataframe typically + has more values than the original dataframe. Missing values + are filled with NULLs: + + >>> df_multi_level_cols2 + weight height + kg m + cat 1.0 2.0 + dog 3.0 4.0 + >>> df_multi_level_cols2.stack() + height weight + cat kg 1.0 + m 2.0 + dog kg 3.0 + m 4.0 + + **Prescribing the level(s) to be stacked** + + The first parameter controls which level or levels are stacked: + + >>> df_multi_level_cols2.stack(0) + kg m + cat height 2.0 + weight 1.0 + dog height 4.0 + weight 3.0 + + >>> df_multi_level_cols2.stack([0, 1]) + cat height m 2.0 + weight kg 1.0 + dog height m 4.0 + weight kg 3.0 + dtype: float64 """ - assert level in (None, -1) - repeated_index = self.index.repeat(self.shape[1]) - name_index = libcudf.reshape.tile( - [as_column(self._column_names)], self.shape[0] - ) - new_index_columns = [*repeated_index._columns, *name_index] - if isinstance(self._index, MultiIndex): - index_names = self._index.names + [None] + + if isinstance(level, (int, str)): + level = [level] + elif isinstance(level, list): + if not all(isinstance(lv, (int, str)) for lv in level): + raise ValueError( + "level must be either an int/str, or a list of int/str." + ) else: - index_names = [None] * len(new_index_columns) + raise ValueError( + "level must be either an int/str, or a list of int/str." + ) + + level = [level] if not isinstance(level, list) else level + + if len(level) > 1 and not dropna: + raise NotImplementedError( + "When stacking multiple levels, setting `dropna` to False " + "will generate new column combination that does not exist " + "in original dataframe. This behavior is unsupported in " + "cuDF. See pandas deprecation note: " + "https://github.com/pandas-dev/pandas/issues/53515" + ) + + # Compute the columns to stack based on specified levels + + level_indices: list[int] = [] + + # If all passed in level names match up to the dataframe column's level + # names, cast them to indices + if all(lv in self._data.level_names for lv in level): + level_indices = [self._data.level_names.index(lv) for lv in level] + elif not all(isinstance(lv, int) for lv in level): + raise ValueError( + "`level` must either be a list of names or positions, not a " + "mixture of both." + ) + else: + # Must be a list of positions, normalize negative positions + level_indices = [ + lv + self._data.nlevels if lv < 0 else lv for lv in level + ] + + unnamed_levels_indices = [ + i for i in range(self._data.nlevels) if i not in level_indices + ] + has_unnamed_levels = len(unnamed_levels_indices) > 0 + + column_name_idx = self._data.to_pandas_index() + # Construct new index from the levels specified by `level` + named_levels = pd.MultiIndex.from_arrays( + [column_name_idx.get_level_values(lv) for lv in level_indices] + ) + + # Since `level` may only specify a subset of all levels, `unique()` is + # required to remove duplicates. In pandas, the order of the keys in + # the specified levels are always sorted. 
+ unique_named_levels = named_levels.unique().sort_values() + + # Each index from the original dataframe should repeat by the number + # of unique values in the named_levels + repeated_index = self.index.repeat(len(unique_named_levels)) + + # Each column name should tile itself by len(df) times + tiled_index = libcudf.reshape.tile( + [ + as_column(unique_named_levels.get_level_values(i)) + for i in range(unique_named_levels.nlevels) + ], + self.shape[0], + ) + + # Assemble the final index + new_index_columns = [*repeated_index._columns, *tiled_index] + index_names = [*self._index.names, *unique_named_levels.names] new_index = MultiIndex.from_frame( DataFrame._from_data( dict(zip(range(0, len(new_index_columns)), new_index_columns)) @@ -6482,30 +6652,88 @@ def stack(self, level=-1, dropna=True): names=index_names, ) - # Collect datatypes and cast columns as that type - common_type = np.result_type(*self.dtypes) - homogenized = DataFrame._from_data( - { - c: ( - self._data[c].astype(common_type) - if not np.issubdtype(self._data[c].dtype, common_type) - else self._data[c] - ) - for c in self._data - } + # Compute the column indices that serves as the input for + # `interleave_columns` + column_idx_df = pd.DataFrame( + data=range(len(self._data)), index=named_levels ) - result = Series._from_data( - { - None: libcudf.reshape.interleave_columns( - [*homogenized._columns] + column_indices: list[list[int]] = [] + if has_unnamed_levels: + unnamed_level_values = list( + map(column_name_idx.get_level_values, unnamed_levels_indices) + ) + unnamed_level_values = pd.MultiIndex.from_arrays( + unnamed_level_values + ) + + def unnamed_group_generator(): + if has_unnamed_levels: + for _, grpdf in column_idx_df.groupby(by=unnamed_level_values): + # When stacking part of the levels, some combinations + # of keys may not be present in this group but can be + # present in others. Reindexing with the globally computed + # `unique_named_levels` assigns -1 to these key + # combinations, representing an all-null column that + # is used in the subsequent libcudf call. + yield grpdf.reindex( + unique_named_levels, axis=0, fill_value=-1 + ).sort_index().values + else: + yield column_idx_df.sort_index().values + + column_indices = list(unnamed_group_generator()) + + # For each of the group constructed from the unnamed levels, + # invoke `interleave_columns` to stack the values. + stacked = [] + + for column_idx in column_indices: + # Collect columns based on indices, append None for -1 indices. 
+ columns = [ + None if i == -1 else self._data.select_by_index(i).columns[0] + for i in column_idx + ] + + # Collect datatypes and cast columns as that type + common_type = np.result_type( + *(col.dtype for col in columns if col is not None) + ) + + all_nulls = functools.cache( + functools.partial( + column_empty, self.shape[0], common_type, masked=True ) - }, - index=new_index, - ) + ) + + # homogenize the dtypes of the columns + homogenized = [ + col.astype(common_type) if col is not None else all_nulls() + for col in columns + ] + + stacked.append(libcudf.reshape.interleave_columns(homogenized)) + + # Construct the resulting dataframe / series + if not has_unnamed_levels: + result = Series._from_data( + data={None: stacked[0]}, index=new_index + ) + else: + if unnamed_level_values.nlevels == 1: + unnamed_level_values = unnamed_level_values.get_level_values(0) + unnamed_level_values = unnamed_level_values.unique().sort_values() + + data = ColumnAccessor( + dict(zip(unnamed_level_values, stacked)), + isinstance(unnamed_level_values, pd.MultiIndex), + unnamed_level_values.names, + ) + + result = DataFrame._from_data(data, index=new_index) if dropna: - return result.dropna() + return result.dropna(how="all") else: return result diff --git a/python/cudf/cudf/tests/test_reshape.py b/python/cudf/cudf/tests/test_reshape.py index 8a8fb9a2002..0a07eecd096 100644 --- a/python/cudf/cudf/tests/test_reshape.py +++ b/python/cudf/cudf/tests/test_reshape.py @@ -151,6 +151,101 @@ def test_df_stack_reset_index(): assert_eq(expected, actual) +@pytest.mark.parametrize( + "columns", + [ + pd.MultiIndex.from_tuples( + [("A", "cat"), ("A", "dog"), ("B", "cat"), ("B", "dog")], + names=["letter", "animal"], + ), + pd.MultiIndex.from_tuples( + [("A", "cat"), ("B", "bird"), ("A", "dog"), ("B", "dog")], + names=["letter", "animal"], + ), + ], +) +@pytest.mark.parametrize( + "level", + [ + -1, + 0, + 1, + "letter", + "animal", + [0, 1], + [1, 0], + ["letter", "animal"], + ["animal", "letter"], + ], +) +@pytest.mark.parametrize( + "index", + [ + pd.RangeIndex(2, name="range"), + pd.Index([9, 8], name="myindex"), + pd.MultiIndex.from_arrays( + [ + ["A", "B"], + [101, 102], + ], + names=["first", "second"], + ), + ], +) +@pytest.mark.parametrize("dropna", [True, False]) +def test_df_stack_multiindex_column_axis(columns, index, level, dropna): + if isinstance(level, list) and len(level) > 1 and not dropna: + pytest.skip( + "Stacking multiple levels with dropna==False is unsupported." 
+ ) + + pdf = pd.DataFrame( + data=[[1, 2, 3, 4], [2, 4, 6, 8]], columns=columns, index=index + ) + gdf = cudf.from_pandas(pdf) + + got = gdf.stack(level=level, dropna=dropna) + expect = pdf.stack(level=level, dropna=dropna) + + assert_eq(expect, got, check_dtype=False) + + +def test_df_stack_mixed_dtypes(): + pdf = pd.DataFrame( + { + "A": pd.Series([1, 2, 3], dtype="f4"), + "B": pd.Series([4, 5, 6], dtype="f8"), + } + ) + + gdf = cudf.from_pandas(pdf) + + got = gdf.stack() + expect = pdf.stack() + + assert_eq(expect, got, check_dtype=False) + + +@pytest.mark.parametrize("level", [["animal", "hair_length"], [1, 2]]) +def test_df_stack_multiindex_column_axis_pd_example(level): + columns = pd.MultiIndex.from_tuples( + [ + ("A", "cat", "long"), + ("B", "cat", "long"), + ("A", "dog", "short"), + ("B", "dog", "short"), + ], + names=["exp", "animal", "hair_length"], + ) + + df = pd.DataFrame(np.random.randn(4, 4), columns=columns) + + expect = df.stack(level=level) + got = cudf.from_pandas(df).stack(level=level) + + assert_eq(expect, got) + + @pytest.mark.parametrize("num_rows", [1, 2, 10, 1000]) @pytest.mark.parametrize("num_cols", [1, 2, 10]) @pytest.mark.parametrize( From d2efb1f5cab49a823b4523a470c2478d86d09e7f Mon Sep 17 00:00:00 2001 From: Martin Marenz Date: Thu, 24 Aug 2023 21:03:57 +0200 Subject: [PATCH 096/230] Add `bytes_per_second` to compiled binaryop benchmark (#13938) To add `bytes_per_second`, a call to `.SetBytesProcessed()` with the number of written and read bytes is added to the benchmark. This patch relates to #13735. Authors: - Martin Marenz (https://github.com/Blonck) Approvers: - Karthikeyan (https://github.com/karthikeyann) - Yunsong Wang (https://github.com/PointKernel) URL: https://github.com/rapidsai/cudf/pull/13938 --- cpp/benchmarks/binaryop/compiled_binaryop.cpp | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/cpp/benchmarks/binaryop/compiled_binaryop.cpp b/cpp/benchmarks/binaryop/compiled_binaryop.cpp index fbba38431dd..a1131df4472 100644 --- a/cpp/benchmarks/binaryop/compiled_binaryop.cpp +++ b/cpp/benchmarks/binaryop/compiled_binaryop.cpp @@ -42,6 +42,10 @@ void BM_compiled_binaryop(benchmark::State& state, cudf::binary_operator binop) cuda_event_timer timer(state, true); cudf::binary_operation(lhs, rhs, binop, output_dtype); } + + // use number of bytes read and written to global memory + state.SetBytesProcessed(static_cast(state.iterations()) * column_size * + (sizeof(TypeLhs) + sizeof(TypeRhs) + sizeof(TypeOut))); } // TODO tparam boolean for null. From ff99f98103a4858a2402a1a32b4e04515c1c4e9f Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Thu, 24 Aug 2023 15:28:09 -0700 Subject: [PATCH 097/230] Use `thread_index_type` to avoid index overflow in grid-stride loops (#13895) This PR checks all related files under `src/hash`, `src/bitmask` and `src/transform` folders and fixes potential index overflow issues by using `thread_index_type`. 
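A minimal, self-contained sketch of the pattern being adopted (the `thread_index_type` alias below is a local stand-in for `cudf::thread_index_type`; the kernel itself is illustrative only, not code from this PR):

```cuda
#include <cstdint>

// Stand-in for cudf::thread_index_type: a 64-bit signed integer.
using thread_index_type = std::int64_t;

// Grid-stride loop whose start index and stride are computed in 64-bit
// arithmetic, so gridDim.x * blockDim.x (and the running index) cannot
// overflow a 32-bit integer on very large inputs.
__global__ void fill_ones(float* out, thread_index_type num_elements)
{
  thread_index_type const start =
    threadIdx.x + blockIdx.x * static_cast<thread_index_type>(blockDim.x);
  thread_index_type const stride =
    static_cast<thread_index_type>(blockDim.x) * gridDim.x;

  for (auto i = start; i < num_elements; i += stride) {
    out[i] = 1.0f;
  }
}
```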
Authors: - Yunsong Wang (https://github.com/PointKernel) Approvers: - Nghia Truong (https://github.com/ttnghia) - MithunR (https://github.com/mythrocks) - Karthikeyan (https://github.com/karthikeyann) URL: https://github.com/rapidsai/cudf/pull/13895 --- cpp/src/bitmask/null_mask.cu | 24 ++++++++++++++---------- cpp/src/transform/jit/kernel.cu | 15 ++++++--------- 2 files changed, 20 insertions(+), 19 deletions(-) diff --git a/cpp/src/bitmask/null_mask.cu b/cpp/src/bitmask/null_mask.cu index bbe603dfdbc..33dc7e0556b 100644 --- a/cpp/src/bitmask/null_mask.cu +++ b/cpp/src/bitmask/null_mask.cu @@ -104,13 +104,15 @@ __global__ void set_null_mask_kernel(bitmask_type* __restrict__ destination, bool valid, size_type number_of_mask_words) { - auto x = destination + word_index(begin_bit); - auto const last_word = word_index(end_bit) - word_index(begin_bit); - bitmask_type fill_value = valid ? 0xffff'ffff : 0; + auto x = destination + word_index(begin_bit); + thread_index_type const last_word = word_index(end_bit) - word_index(begin_bit); + bitmask_type fill_value = valid ? 0xffff'ffff : 0; - for (size_type destination_word_index = threadIdx.x + blockIdx.x * blockDim.x; + thread_index_type const stride = blockDim.x * gridDim.x; + + for (thread_index_type destination_word_index = grid_1d::global_thread_id(); destination_word_index < number_of_mask_words; - destination_word_index += blockDim.x * gridDim.x) { + destination_word_index += stride) { if (destination_word_index == 0 || destination_word_index == last_word) { bitmask_type mask = ~bitmask_type{0}; if (destination_word_index == 0) { @@ -189,9 +191,10 @@ __global__ void copy_offset_bitmask(bitmask_type* __restrict__ destination, size_type source_end_bit, size_type number_of_mask_words) { - for (size_type destination_word_index = threadIdx.x + blockIdx.x * blockDim.x; + thread_index_type const stride = blockDim.x * gridDim.x; + for (thread_index_type destination_word_index = grid_1d::global_thread_id(); destination_word_index < number_of_mask_words; - destination_word_index += blockDim.x * gridDim.x) { + destination_word_index += stride) { destination[destination_word_index] = detail::get_mask_offset_word( source, destination_word_index, source_begin_bit, source_end_bit); } @@ -261,14 +264,15 @@ __global__ void count_set_bits_kernel(bitmask_type const* bitmask, auto const first_word_index{word_index(first_bit_index)}; auto const last_word_index{word_index(last_bit_index)}; - auto const tid = threadIdx.x + blockIdx.x * blockDim.x; - auto thread_word_index = tid + first_word_index; + thread_index_type const tid = grid_1d::global_thread_id(); + thread_index_type const stride = blockDim.x * gridDim.x; + thread_index_type thread_word_index = tid + first_word_index; size_type thread_count{0}; // First, just count the bits in all words while (thread_word_index <= last_word_index) { thread_count += __popc(bitmask[thread_word_index]); - thread_word_index += blockDim.x * gridDim.x; + thread_word_index += stride; } // Subtract any slack bits counted from the first and last word diff --git a/cpp/src/transform/jit/kernel.cu b/cpp/src/transform/jit/kernel.cu index 3360ac8cf77..0170cc50c6f 100644 --- a/cpp/src/transform/jit/kernel.cu +++ b/cpp/src/transform/jit/kernel.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2021, NVIDIA CORPORATION. + * Copyright (c) 2019-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -37,15 +37,12 @@ namespace jit { template __global__ void kernel(cudf::size_type size, TypeOut* out_data, TypeIn* in_data) { - int tid = threadIdx.x; - int blkid = blockIdx.x; - int blksz = blockDim.x; - int gridsz = gridDim.x; + // cannot use global_thread_id utility due to a JIT build issue by including + // the `cudf/detail/utilities/cuda.cuh` header + thread_index_type const start = threadIdx.x + blockIdx.x * blockDim.x; + thread_index_type const stride = blockDim.x * gridDim.x; - int start = tid + blkid * blksz; - int step = blksz * gridsz; - - for (cudf::size_type i = start; i < size; i += step) { + for (auto i = start; i < static_cast(size); i += stride) { GENERIC_UNARY_OP(&out_data[i], in_data[i]); } } From 6095a92395e32d96ad3595c5610d52b62886cc20 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Thu, 24 Aug 2023 16:28:46 -0700 Subject: [PATCH 098/230] Read FIXED_LEN_BYTE_ARRAY as binary in parquet reader (#13437) Closes #12590 This PR adds support of reading `FIXED_LEN_BYTE_ARRAY` as lists of `INT8` in the parquet reader. Authors: - Yunsong Wang (https://github.com/PointKernel) - Vukasin Milovanovic (https://github.com/vuule) Approvers: - Vukasin Milovanovic (https://github.com/vuule) - Mike Wilson (https://github.com/hyperbolic2346) - Robert (Bobby) Evans (https://github.com/revans2) - GALI PREM SAGAR (https://github.com/galipremsagar) URL: https://github.com/rapidsai/cudf/pull/13437 --- cpp/src/io/parquet/page_data.cu | 16 +++- cpp/src/io/parquet/page_decode.cuh | 40 +++++----- cpp/src/io/parquet/page_string_decode.cu | 69 ++++++++++-------- cpp/src/io/parquet/parquet_gpu.hpp | 7 +- cpp/src/io/parquet/reader_impl.cpp | 12 ++- .../data/parquet/fixed_len_byte_array.parquet | Bin 0 -> 259 bytes python/cudf/cudf/tests/test_parquet.py | 9 +++ 7 files changed, 95 insertions(+), 58 deletions(-) create mode 100644 python/cudf/cudf/tests/data/parquet/fixed_len_byte_array.parquet diff --git a/cpp/src/io/parquet/page_data.cu b/cpp/src/io/parquet/page_data.cu index e30df2d1f98..c26802aa3c2 100644 --- a/cpp/src/io/parquet/page_data.cu +++ b/cpp/src/io/parquet/page_data.cu @@ -48,7 +48,8 @@ inline __device__ void gpuOutputString(volatile page_state_s* s, void* dstv) { auto [ptr, len] = gpuGetStringData(s, sb, src_pos); - if (s->dtype_len == 4) { + // make sure to only hash `BYTE_ARRAY` when specified with the output type size + if (s->dtype_len == 4 and (s->col.data_type & 7) == BYTE_ARRAY) { // Output hash. This hash value is used if the option to convert strings to // categoricals is enabled. The seed value is chosen arbitrarily. uint32_t constexpr hash_seed = 33; @@ -456,8 +457,12 @@ __global__ void __launch_bounds__(decode_block_size) gpuDecodePageData( if (s->dict_base) { out_thread0 = (s->dict_bits > 0) ? 64 : 32; } else { - out_thread0 = - ((s->col.data_type & 7) == BOOLEAN || (s->col.data_type & 7) == BYTE_ARRAY) ? 
64 : 32; + switch (s->col.data_type & 7) { + case BOOLEAN: [[fallthrough]]; + case BYTE_ARRAY: [[fallthrough]]; + case FIXED_LEN_BYTE_ARRAY: out_thread0 = 64; break; + default: out_thread0 = 32; + } } PageNestingDecodeInfo* nesting_info_base = s->nesting_info; @@ -494,7 +499,8 @@ __global__ void __launch_bounds__(decode_block_size) gpuDecodePageData( src_target_pos = gpuDecodeDictionaryIndices(s, sb, src_target_pos, t & 0x1f).first; } else if ((s->col.data_type & 7) == BOOLEAN) { src_target_pos = gpuDecodeRleBooleans(s, sb, src_target_pos, t & 0x1f); - } else if ((s->col.data_type & 7) == BYTE_ARRAY) { + } else if ((s->col.data_type & 7) == BYTE_ARRAY or + (s->col.data_type & 7) == FIXED_LEN_BYTE_ARRAY) { gpuInitStringDescriptors(s, sb, src_target_pos, t & 0x1f); } if (t == 32) { *(volatile int32_t*)&s->dict_pos = src_target_pos; } @@ -564,6 +570,8 @@ __global__ void __launch_bounds__(decode_block_size) gpuDecodePageData( } break; } + } else if (dtype == FIXED_LEN_BYTE_ARRAY) { + gpuOutputString(s, sb, val_src_pos, dst); } else if (dtype == INT96) { gpuOutputInt96Timestamp(s, sb, val_src_pos, static_cast(dst)); } else if (dtype_len == 8) { diff --git a/cpp/src/io/parquet/page_decode.cuh b/cpp/src/io/parquet/page_decode.cuh index e172382e23a..26e3c951b2e 100644 --- a/cpp/src/io/parquet/page_decode.cuh +++ b/cpp/src/io/parquet/page_decode.cuh @@ -421,13 +421,15 @@ __device__ size_type gpuInitStringDescriptors(page_state_s volatile* s, int k = s->dict_val; while (pos < target_pos) { - int len; - if (k + 4 <= dict_size) { - len = (cur[k]) | (cur[k + 1] << 8) | (cur[k + 2] << 16) | (cur[k + 3] << 24); - k += 4; - if (k + len > dict_size) { len = 0; } + int len = 0; + if ((s->col.data_type & 7) == FIXED_LEN_BYTE_ARRAY) { + if (k < dict_size) { len = s->dtype_len_in; } } else { - len = 0; + if (k + 4 <= dict_size) { + len = (cur[k]) | (cur[k + 1] << 8) | (cur[k + 2] << 16) | (cur[k + 3] << 24); + k += 4; + if (k + len > dict_size) { len = 0; } + } } if constexpr (!sizes_only) { sb->dict_idx[rolling_index(pos)] = k; @@ -1154,16 +1156,20 @@ inline __device__ bool setupLocalPageInfo(page_state_s* const s, } // Special check for downconversions s->dtype_len_in = s->dtype_len; - if (s->col.converted_type == DECIMAL && data_type == FIXED_LEN_BYTE_ARRAY) { - s->dtype_len = [dtype_len = s->dtype_len]() { - if (dtype_len <= sizeof(int32_t)) { - return sizeof(int32_t); - } else if (dtype_len <= sizeof(int64_t)) { - return sizeof(int64_t); - } else { - return sizeof(__int128_t); - } - }(); + if (data_type == FIXED_LEN_BYTE_ARRAY) { + if (s->col.converted_type == DECIMAL) { + s->dtype_len = [dtype_len = s->dtype_len]() { + if (dtype_len <= sizeof(int32_t)) { + return sizeof(int32_t); + } else if (dtype_len <= sizeof(int64_t)) { + return sizeof(int64_t); + } else { + return sizeof(__int128_t); + } + }(); + } else { + s->dtype_len = sizeof(string_index_pair); + } } else if (data_type == INT32) { if (dtype_len_out == 1) { // INT8 output @@ -1219,7 +1225,7 @@ inline __device__ bool setupLocalPageInfo(page_state_s* const s, uint32_t len = idx < max_depth - 1 ? sizeof(cudf::size_type) : s->dtype_len; // if this is a string column, then dtype_len is a lie. data will be offsets rather // than (ptr,len) tuples. 
- if (data_type == BYTE_ARRAY && s->dtype_len != 4) { len = sizeof(cudf::size_type); } + if (is_string_col(s->col)) { len = sizeof(cudf::size_type); } nesting_info->data_out += (output_offset * len); } if (nesting_info->string_out != nullptr) { diff --git a/cpp/src/io/parquet/page_string_decode.cu b/cpp/src/io/parquet/page_string_decode.cu index 331cc72f119..1ac4c95f713 100644 --- a/cpp/src/io/parquet/page_string_decode.cu +++ b/cpp/src/io/parquet/page_string_decode.cu @@ -521,39 +521,44 @@ __global__ void __launch_bounds__(preprocess_block_size) gpuComputePageStringSiz pp->num_valids = s->page.num_valids; } - // now process string info in the range [start_value, end_value) - // set up for decoding strings...can be either plain or dictionary - auto const& col = s->col; - uint8_t const* data = s->data_start; - uint8_t const* const end = s->data_end; - uint8_t const* dict_base = nullptr; - int dict_size = 0; - size_t str_bytes = 0; - - switch (pp->encoding) { - case Encoding::PLAIN_DICTIONARY: - case Encoding::RLE_DICTIONARY: - // RLE-packed dictionary indices, first byte indicates index length in bits - if (col.str_dict_index) { - // String dictionary: use index - dict_base = reinterpret_cast(col.str_dict_index); - dict_size = col.page_info[0].num_input_values * sizeof(string_index_pair); - } else { - dict_base = col.page_info[0].page_data; // dictionary is always stored in the first page - dict_size = col.page_info[0].uncompressed_page_size; - } + auto const& col = s->col; + size_t str_bytes = 0; + // short circuit for FIXED_LEN_BYTE_ARRAY + if ((col.data_type & 7) == FIXED_LEN_BYTE_ARRAY) { + str_bytes = pp->num_valids * s->dtype_len_in; + } else { + // now process string info in the range [start_value, end_value) + // set up for decoding strings...can be either plain or dictionary + uint8_t const* data = s->data_start; + uint8_t const* const end = s->data_end; + uint8_t const* dict_base = nullptr; + int dict_size = 0; + + switch (pp->encoding) { + case Encoding::PLAIN_DICTIONARY: + case Encoding::RLE_DICTIONARY: + // RLE-packed dictionary indices, first byte indicates index length in bits + if (col.str_dict_index) { + // String dictionary: use index + dict_base = reinterpret_cast(col.str_dict_index); + dict_size = col.page_info[0].num_input_values * sizeof(string_index_pair); + } else { + dict_base = col.page_info[0].page_data; // dictionary is always stored in the first page + dict_size = col.page_info[0].uncompressed_page_size; + } - // FIXME: need to return an error condition...this won't actually do anything - if (s->dict_bits > 32 || !dict_base) { CUDF_UNREACHABLE("invalid dictionary bit size"); } - - str_bytes = totalDictEntriesSize( - data, dict_base, s->dict_bits, dict_size, (end - data), start_value, end_value); - break; - case Encoding::PLAIN: - dict_size = static_cast(end - data); - str_bytes = is_bounds_pg ? totalPlainEntriesSize(data, dict_size, start_value, end_value) - : dict_size - sizeof(int) * (pp->num_input_values - pp->num_nulls); - break; + // FIXME: need to return an error condition...this won't actually do anything + if (s->dict_bits > 32 || !dict_base) { CUDF_UNREACHABLE("invalid dictionary bit size"); } + + str_bytes = totalDictEntriesSize( + data, dict_base, s->dict_bits, dict_size, (end - data), start_value, end_value); + break; + case Encoding::PLAIN: + dict_size = static_cast(end - data); + str_bytes = is_bounds_pg ? 
totalPlainEntriesSize(data, dict_size, start_value, end_value) + : dict_size - sizeof(int) * pp->num_valids; + break; + } } if (t == 0) { diff --git a/cpp/src/io/parquet/parquet_gpu.hpp b/cpp/src/io/parquet/parquet_gpu.hpp index fd971e342c0..0a8640aef26 100644 --- a/cpp/src/io/parquet/parquet_gpu.hpp +++ b/cpp/src/io/parquet/parquet_gpu.hpp @@ -452,8 +452,11 @@ struct EncPage { */ constexpr bool is_string_col(ColumnChunkDesc const& chunk) { - return (chunk.data_type & 7) == BYTE_ARRAY and (chunk.data_type >> 3) != 4 and - chunk.converted_type != DECIMAL; + auto const not_converted_to_decimal = chunk.converted_type != DECIMAL; + auto const non_hashed_byte_array = + (chunk.data_type & 7) == BYTE_ARRAY and (chunk.data_type >> 3) != 4; + auto const fixed_len_byte_array = (chunk.data_type & 7) == FIXED_LEN_BYTE_ARRAY; + return not_converted_to_decimal and (non_hashed_byte_array or fixed_len_byte_array); } /** diff --git a/cpp/src/io/parquet/reader_impl.cpp b/cpp/src/io/parquet/reader_impl.cpp index 3f58fc8d42d..9289ddb91b3 100644 --- a/cpp/src/io/parquet/reader_impl.cpp +++ b/cpp/src/io/parquet/reader_impl.cpp @@ -401,9 +401,15 @@ table_with_metadata reader::impl::read_chunk_internal( // Create the final output cudf columns. for (size_t i = 0; i < _output_buffers.size(); ++i) { - auto const metadata = _reader_column_schema.has_value() - ? std::make_optional((*_reader_column_schema)[i]) - : std::nullopt; + auto metadata = _reader_column_schema.has_value() + ? std::make_optional((*_reader_column_schema)[i]) + : std::nullopt; + auto const& schema = _metadata->get_schema(_output_column_schemas[i]); + // FIXED_LEN_BYTE_ARRAY never read as string + if (schema.type == FIXED_LEN_BYTE_ARRAY and schema.converted_type != DECIMAL) { + metadata = std::make_optional(); + metadata->set_convert_binary_to_strings(false); + } // Only construct `out_metadata` if `_output_metadata` has not been cached. 
if (!_output_metadata) { column_name_info& col_name = out_metadata.schema_info[i]; diff --git a/python/cudf/cudf/tests/data/parquet/fixed_len_byte_array.parquet b/python/cudf/cudf/tests/data/parquet/fixed_len_byte_array.parquet new file mode 100644 index 0000000000000000000000000000000000000000..b0ee8f2e4d2dd92b217bbe825417b83b36b03735 GIT binary patch literal 259 zcmYLE!D_=W3>78=pE?GWiNJ?k7BXl{ZFf!!W00Q4PGvt(8rOx;t=p1x-?cyJai6qn zRdyIcNRKDIC#|X%gMgCb`}FgBtugt70M*i1x)GELuBoV&C`bX2RuVlozp>-?N#|1IfNnw3KZVRGuY4{aJQS>av0!0#W>EmO#i4q{9C#?n zrSe07;+cIukFg)Wwb@r}yXO6nnTfFzn-%r3dEI7Z8QnC@rwixiK8AZ6T-a@VaI?CM VJLjx%)`rFGjU|= literal 0 HcmV?d00001 diff --git a/python/cudf/cudf/tests/test_parquet.py b/python/cudf/cudf/tests/test_parquet.py index 2cc4b32443d..66c4a253423 100644 --- a/python/cudf/cudf/tests/test_parquet.py +++ b/python/cudf/cudf/tests/test_parquet.py @@ -2568,6 +2568,15 @@ def test_parquet_reader_binary_decimal(datadir): assert_eq(expect, got) +def test_parquet_reader_fixed_bin(datadir): + fname = datadir / "fixed_len_byte_array.parquet" + + expect = pd.read_parquet(fname) + got = cudf.read_parquet(fname) + + assert_eq(expect, got) + + def test_parquet_reader_rle_boolean(datadir): fname = datadir / "rle_boolean_encoding.parquet" From 384b33f6eb3627091aa6bdc3dd6938ba87739cbe Mon Sep 17 00:00:00 2001 From: GALI PREM SAGAR Date: Fri, 25 Aug 2023 10:01:18 -0500 Subject: [PATCH 099/230] Fix setting of categories order when `dtype` is passed to a `CategoricalColumn` (#13955) closes #13947 This PR fixes the setting of categories in the appropriate order when a dtype with the same categories but of mismatching order is passed. Authors: - GALI PREM SAGAR (https://github.com/galipremsagar) Approvers: - Lawrence Mitchell (https://github.com/wence-) URL: https://github.com/rapidsai/cudf/pull/13955 --- python/cudf/cudf/core/column/categorical.py | 2 +- python/cudf/cudf/tests/test_categorical.py | 10 ++++++++++ 2 files changed, 11 insertions(+), 1 deletion(-) diff --git a/python/cudf/cudf/core/column/categorical.py b/python/cudf/cudf/core/column/categorical.py index 9a1fab372a6..eaffc18db70 100644 --- a/python/cudf/cudf/core/column/categorical.py +++ b/python/cudf/cudf/core/column/categorical.py @@ -1483,7 +1483,7 @@ def set_categories( ), ) elif ( - not out_col._categories_equal(new_categories, ordered=ordered) + not out_col._categories_equal(new_categories, ordered=True) or not self.ordered == ordered ): out_col = out_col._set_categories( diff --git a/python/cudf/cudf/tests/test_categorical.py b/python/cudf/cudf/tests/test_categorical.py index 2c8226e4fe5..8b3d75fe59e 100644 --- a/python/cudf/cudf/tests/test_categorical.py +++ b/python/cudf/cudf/tests/test_categorical.py @@ -926,3 +926,13 @@ def test_categorical_string_index_contains(data, value): pidx = idx.to_pandas() assert_eq(value in idx, value in pidx) + + +def test_categorical_index_with_dtype(): + dtype = cudf.CategoricalDtype(categories=["a", "z", "c"]) + gi = cudf.Index(["z", "c", "a"], dtype=dtype) + pi = pd.Index(["z", "c", "a"], dtype=dtype.to_pandas()) + + assert_eq(gi, pi) + assert_eq(gi.dtype, pi.dtype) + assert_eq(gi.dtype.categories, pi.dtype.categories) From aef903c15cced79906ab8a0ccce11644f1fd6de6 Mon Sep 17 00:00:00 2001 From: GALI PREM SAGAR Date: Fri, 25 Aug 2023 10:09:04 -0500 Subject: [PATCH 100/230] Add pandas compatible output to `Series.unique` (#13959) Partially addresses #8175 This PR makes changes to `Series.unique`, where a cupy array is returned to match `pd.Series.unique` where a numpy array is returned. 
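For example (a sketch based on the new test):

```python
import cudf

gs = cudf.Series([10, 11, 12, 11, 10])

# Default behaviour is unchanged: unique() returns a cudf.Series.
print(type(gs.unique()))

# Under pandas-compatible mode the unique values are returned as an array
# (a cupy array on device), mirroring pandas returning a numpy array.
with cudf.option_context("mode.pandas_compatible", True):
    print(type(gs.unique()))  # cupy.ndarray
```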
Authors: - GALI PREM SAGAR (https://github.com/galipremsagar) Approvers: - Lawrence Mitchell (https://github.com/wence-) URL: https://github.com/rapidsai/cudf/pull/13959 --- python/cudf/cudf/core/series.py | 2 ++ python/cudf/cudf/tests/test_series.py | 9 +++++++++ 2 files changed, 11 insertions(+) diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index b63261ef840..30d584c2270 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -2970,6 +2970,8 @@ def unique(self): dtype: object """ res = self._column.unique() + if cudf.get_option("mode.pandas_compatible"): + return res.values return Series(res, name=self.name) @_cudf_nvtx_annotate diff --git a/python/cudf/cudf/tests/test_series.py b/python/cudf/cudf/tests/test_series.py index db1249213f8..51c6bb1634d 100644 --- a/python/cudf/cudf/tests/test_series.py +++ b/python/cudf/cudf/tests/test_series.py @@ -2254,3 +2254,12 @@ def test_series_nlargest_nsmallest_str_error(attr): assert_exceptions_equal( getattr(gs, attr), getattr(ps, attr), ([], {"n": 1}), ([], {"n": 1}) ) + + +def test_series_unique_pandas_compatibility(): + gs = cudf.Series([10, 11, 12, 11, 10]) + ps = gs.to_pandas() + with cudf.option_context("mode.pandas_compatible", True): + actual = gs.unique() + expected = ps.unique() + assert_eq(actual, expected) From 5d5f4f5ff8bfb3b9989de8d25a7edfccfd8a4ba8 Mon Sep 17 00:00:00 2001 From: GALI PREM SAGAR Date: Fri, 25 Aug 2023 12:34:16 -0500 Subject: [PATCH 101/230] Fix type metadata issue preservation with `Column.unique` (#13957) closes #13953 This PR fixes an issue with `Column.unique` where the type-metadata wasn't being preserved in the end before returning the unique values. This lead to `IntervalColumn` being returned as a `StructColumn`. Authors: - GALI PREM SAGAR (https://github.com/galipremsagar) Approvers: - https://github.com/brandon-b-miller URL: https://github.com/rapidsai/cudf/pull/13957 --- python/cudf/cudf/core/column/column.py | 4 +++- python/cudf/cudf/tests/test_interval.py | 22 +++++++++++++++++++++- 2 files changed, 24 insertions(+), 2 deletions(-) diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index c1ad5de1181..446f01ef419 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -1162,7 +1162,9 @@ def unique(self) -> ColumnBase: """ Get unique values in the data """ - return drop_duplicates([self], keep="first")[0] + return drop_duplicates([self], keep="first")[0]._with_type_metadata( + self.dtype + ) def serialize(self) -> Tuple[dict, list]: # data model: diff --git a/python/cudf/cudf/tests/test_interval.py b/python/cudf/cudf/tests/test_interval.py index e1104829914..18454172289 100644 --- a/python/cudf/cudf/tests/test_interval.py +++ b/python/cudf/cudf/tests/test_interval.py @@ -1,5 +1,7 @@ -# Copyright (c) 2020-2022, NVIDIA CORPORATION. +# Copyright (c) 2020-2023, NVIDIA CORPORATION. 
+ +import numpy as np import pandas as pd import pytest @@ -132,3 +134,21 @@ def test_create_interval_df(data1, data2, data3, data4, closed): dtype="interval", ) assert_eq(expect_three, got_three) + + +def test_interval_index_unique(): + interval_list = [ + np.nan, + pd.Interval(2.0, 3.0, closed="right"), + pd.Interval(3.0, 4.0, closed="right"), + np.nan, + pd.Interval(3.0, 4.0, closed="right"), + pd.Interval(3.0, 4.0, closed="right"), + ] + pi = pd.Index(interval_list) + gi = cudf.from_pandas(pi) + + expected = pi.unique() + actual = gi.unique() + + assert_eq(expected, actual) From 80d9b1aabd8caeca1d40318dc427d898f2608e21 Mon Sep 17 00:00:00 2001 From: brandon-b-miller <53796099+brandon-b-miller@users.noreply.github.com> Date: Fri, 25 Aug 2023 12:52:54 -0500 Subject: [PATCH 102/230] Handle `as_index` in `GroupBy.apply` (#13951) Closes https://github.com/rapidsai/cudf/issues/13897 Authors: - https://github.com/brandon-b-miller Approvers: - GALI PREM SAGAR (https://github.com/galipremsagar) URL: https://github.com/rapidsai/cudf/pull/13951 --- python/cudf/cudf/core/groupby/groupby.py | 3 +++ python/cudf/cudf/tests/test_groupby.py | 10 ++++++++++ 2 files changed, 13 insertions(+) diff --git a/python/cudf/cudf/core/groupby/groupby.py b/python/cudf/cudf/core/groupby/groupby.py index 63c9dd837a8..cf4c861c28f 100644 --- a/python/cudf/cudf/core/groupby/groupby.py +++ b/python/cudf/cudf/core/groupby/groupby.py @@ -1436,6 +1436,9 @@ def mult(df): if self._sort: result = result.sort_index() + if self._as_index is False: + result = result.reset_index() + result[None] = result.pop(0) return result @_cudf_nvtx_annotate diff --git a/python/cudf/cudf/tests/test_groupby.py b/python/cudf/cudf/tests/test_groupby.py index 784cabaa542..b48ce210104 100644 --- a/python/cudf/cudf/tests/test_groupby.py +++ b/python/cudf/cudf/tests/test_groupby.py @@ -180,6 +180,16 @@ def test_groupby_as_index_single_agg(pdf, gdf, as_index): assert_groupby_results_equal(pdf, gdf) +@pytest.mark.parametrize("engine", ["cudf", "jit"]) +@pytest.mark.parametrize("as_index", [True, False]) +def test_groupby_as_index_apply(pdf, gdf, as_index, engine): + gdf = gdf.groupby("y", as_index=as_index).apply( + lambda df: df["x"].mean(), engine=engine + ) + pdf = pdf.groupby("y", as_index=as_index).apply(lambda df: df["x"].mean()) + assert_groupby_results_equal(pdf, gdf) + + @pytest.mark.parametrize("as_index", [True, False]) def test_groupby_as_index_multiindex(pdf, gdf, as_index): pdf = pd.DataFrame( From 4591dd3f14701da061872eb868641964383fece5 Mon Sep 17 00:00:00 2001 From: David Wendt <45795991+davidwendt@users.noreply.github.com> Date: Fri, 25 Aug 2023 14:18:45 -0400 Subject: [PATCH 103/230] Use cudf::size_type instead of int32 where appropriate in nvtext functions (#13915) Updates code to use `size_type` instead of `int32_t` where appropriate (i.e. offsets). Also changes some code logic for resolving a thread-index in a custom kernel to use the `cudf::thread_index_type` to help avoid overflow of 32-bit integer types. 
Authors: - David Wendt (https://github.com/davidwendt) Approvers: - Bradley Dice (https://github.com/bdice) - Karthikeyan (https://github.com/karthikeyann) URL: https://github.com/rapidsai/cudf/pull/13915 --- cpp/src/text/normalize.cu | 18 ++++---- cpp/src/text/replace.cu | 2 +- cpp/src/text/stemmer.cu | 26 ++++++----- cpp/src/text/subword/data_normalizer.cu | 45 ++++++++++--------- .../text/subword/detail/data_normalizer.hpp | 10 +++-- .../text/subword/detail/tokenizer_utils.cuh | 9 ++-- .../subword/detail/wordpiece_tokenizer.hpp | 4 +- cpp/src/text/subword/subword_tokenize.cu | 21 +++++---- cpp/src/text/subword/wordpiece_tokenizer.cu | 26 ++++++----- cpp/src/text/tokenize.cu | 44 +++++++++--------- cpp/src/text/utilities/tokenize_ops.cuh | 4 +- 11 files changed, 118 insertions(+), 91 deletions(-) diff --git a/cpp/src/text/normalize.cu b/cpp/src/text/normalize.cu index bc2b0607193..78dfb6bf1a6 100644 --- a/cpp/src/text/normalize.cu +++ b/cpp/src/text/normalize.cu @@ -107,8 +107,8 @@ constexpr uint32_t UTF8_3BYTE = 0x01'0000; struct codepoint_to_utf8_fn { cudf::column_device_view const d_strings; // input strings uint32_t const* cp_data; // full code-point array - int32_t const* d_cp_offsets{}; // offsets to each string's code-point array - int32_t* d_offsets{}; // offsets for the output strings + cudf::size_type const* d_cp_offsets{}; // offsets to each string's code-point array + cudf::size_type* d_offsets{}; // offsets for the output strings char* d_chars{}; // buffer for the output strings column /** @@ -118,7 +118,7 @@ struct codepoint_to_utf8_fn { * @param count number of code-points in `str_cps` * @return Number of bytes required for the output */ - __device__ int32_t compute_output_size(uint32_t const* str_cps, uint32_t count) + __device__ cudf::size_type compute_output_size(uint32_t const* str_cps, uint32_t count) { return thrust::transform_reduce( thrust::seq, @@ -126,7 +126,7 @@ struct codepoint_to_utf8_fn { str_cps + count, [](auto cp) { return 1 + (cp >= UTF8_1BYTE) + (cp >= UTF8_2BYTE) + (cp >= UTF8_3BYTE); }, 0, - thrust::plus()); + thrust::plus()); } __device__ void operator()(cudf::size_type idx) @@ -208,9 +208,9 @@ std::unique_ptr normalize_characters(cudf::strings_column_view con auto const aux_table = get_aux_codepoint_data(stream); auto const normalizer = data_normalizer(cp_metadata.data(), aux_table.data(), do_lower_case); auto const offsets = strings.offsets(); - auto const d_offsets = offsets.data() + strings.offset(); - auto const offset = cudf::detail::get_value(offsets, strings.offset(), stream); - auto const d_chars = strings.chars().data() + offset; + auto const d_offsets = offsets.data() + strings.offset(); + auto const offset = cudf::detail::get_value(offsets, strings.offset(), stream); + auto const d_chars = strings.chars().data() + offset; return normalizer.normalize(d_chars, d_offsets, strings.size(), stream); }(); @@ -222,8 +222,8 @@ std::unique_ptr normalize_characters(cudf::strings_column_view con // convert the result into a strings column // - the cp_chars are the new 4-byte code-point values for all the characters in the output // - the cp_offsets identify which code-points go with which strings - uint32_t const* cp_chars = result.first->data(); - int32_t const* cp_offsets = reinterpret_cast(result.second->data()); + uint32_t const* cp_chars = result.first->data(); + cudf::size_type const* cp_offsets = result.second->data(); auto d_strings = cudf::column_device_view::create(strings.parent(), stream); diff --git a/cpp/src/text/replace.cu 
b/cpp/src/text/replace.cu index 3cfaece64d7..d122f048a4e 100644 --- a/cpp/src/text/replace.cu +++ b/cpp/src/text/replace.cu @@ -47,7 +47,7 @@ using replace_result = thrust::pair; struct base_token_replacer_fn { cudf::column_device_view const d_strings; ///< strings to tokenize cudf::string_view const d_delimiter; ///< delimiter characters for tokenizing - int32_t* d_offsets{}; ///< for locating output string in d_chars + cudf::size_type* d_offsets{}; ///< for locating output string in d_chars char* d_chars{}; ///< output buffer /** diff --git a/cpp/src/text/stemmer.cu b/cpp/src/text/stemmer.cu index 6aad75bef71..2b2b8429d9c 100644 --- a/cpp/src/text/stemmer.cu +++ b/cpp/src/text/stemmer.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2022, NVIDIA CORPORATION. + * Copyright (c) 2020-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -184,17 +184,19 @@ struct dispatch_is_letter_fn { struct porter_stemmer_measure_fn { cudf::column_device_view const d_strings; // strings to measure - __device__ int32_t operator()(cudf::size_type idx) const + __device__ cudf::size_type operator()(cudf::size_type idx) const { - if (d_strings.is_null(idx)) return 0; + if (d_strings.is_null(idx)) { return 0; } cudf::string_view d_str = d_strings.element(idx); - if (d_str.empty()) return 0; - int32_t measure = 0; - auto itr = d_str.begin(); - bool vowel_run = !is_consonant(itr); + if (d_str.empty()) { return 0; } + + cudf::size_type measure = 0; + + auto itr = d_str.begin(); + bool vowel_run = !is_consonant(itr); while (itr != d_str.end()) { if (is_consonant(itr)) { - if (vowel_run) measure++; + if (vowel_run) { measure++; } vowel_run = false; } else { vowel_run = true; @@ -211,11 +213,13 @@ std::unique_ptr porter_stemmer_measure(cudf::strings_column_view c rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { - if (strings.is_empty()) return cudf::make_empty_column(cudf::data_type{cudf::type_id::INT32}); + if (strings.is_empty()) { + return cudf::make_empty_column(cudf::data_type{cudf::type_to_id()}); + } // create empty output column auto results = - cudf::make_fixed_width_column(cudf::data_type{cudf::type_id::INT32}, + cudf::make_fixed_width_column(cudf::data_type{cudf::type_to_id()}, strings.size(), cudf::detail::copy_bitmask(strings.parent(), stream, mr), strings.null_count(), @@ -226,7 +230,7 @@ std::unique_ptr porter_stemmer_measure(cudf::strings_column_view c thrust::transform(rmm::exec_policy(stream), thrust::make_counting_iterator(0), thrust::make_counting_iterator(strings.size()), - results->mutable_view().data(), + results->mutable_view().data(), porter_stemmer_measure_fn{*strings_column}); results->set_null_count(strings.null_count()); return results; diff --git a/cpp/src/text/subword/data_normalizer.cu b/cpp/src/text/subword/data_normalizer.cu index 596c8d26e65..34eb95bea5c 100644 --- a/cpp/src/text/subword/data_normalizer.cu +++ b/cpp/src/text/subword/data_normalizer.cu @@ -124,9 +124,10 @@ __device__ bool is_head_byte(unsigned char utf8_byte) { return (utf8_byte >> 6) * @param start_byte_for_thread Which byte to start analyzing * @return New code point value for this byte. 
*/ -__device__ uint32_t extract_code_points_from_utf8(unsigned char const* strings, - size_t const total_bytes, - uint32_t const start_byte_for_thread) +__device__ uint32_t +extract_code_points_from_utf8(unsigned char const* strings, + size_t const total_bytes, + cudf::thread_index_type const start_byte_for_thread) { constexpr uint8_t max_utf8_blocks_for_char = 4; uint8_t utf8_blocks[max_utf8_blocks_for_char] = {0}; @@ -214,8 +215,9 @@ __global__ void kernel_data_normalizer(unsigned char const* strings, constexpr uint32_t init_val = (1 << FILTER_BIT); uint32_t replacement_code_points[MAX_NEW_CHARS] = {init_val, init_val, init_val}; - uint32_t const char_for_thread = blockDim.x * blockIdx.x + threadIdx.x; - uint32_t num_new_chars = 0; + cudf::thread_index_type const char_for_thread = + threadIdx.x + cudf::thread_index_type(blockIdx.x) * cudf::thread_index_type(blockDim.x); + uint32_t num_new_chars = 0; if (char_for_thread < total_bytes) { auto const code_point = extract_code_points_from_utf8(strings, total_bytes, char_for_thread); @@ -273,31 +275,34 @@ data_normalizer::data_normalizer(codepoint_metadata_type const* cp_metadata, } uvector_pair data_normalizer::normalize(char const* d_strings, - uint32_t const* d_offsets, - uint32_t num_strings, + cudf::size_type const* d_offsets, + cudf::size_type num_strings, rmm::cuda_stream_view stream) const { - if (num_strings == 0) - return std::pair(std::make_unique>(0, stream), - std::make_unique>(0, stream)); + if (num_strings == 0) { + return uvector_pair{std::make_unique>(0, stream), + std::make_unique>(0, stream)}; + } // copy offsets to working memory - size_t const num_offsets = num_strings + 1; - auto d_strings_offsets = std::make_unique>(num_offsets, stream); + auto const num_offsets = num_strings + 1; + auto d_strings_offsets = + std::make_unique>(num_offsets, stream); thrust::transform(rmm::exec_policy(stream), - thrust::make_counting_iterator(0), - thrust::make_counting_iterator(num_offsets), + thrust::counting_iterator(0), + thrust::counting_iterator(num_offsets), d_strings_offsets->begin(), [d_offsets] __device__(auto idx) { auto const offset = d_offsets[0]; // adjust for any offset to the offsets return d_offsets[idx] - offset; }); - uint32_t const bytes_count = d_strings_offsets->element(num_strings, stream); - if (bytes_count == 0) // if no bytes, nothing to do - return std::pair(std::make_unique>(0, stream), - std::make_unique>(0, stream)); + auto const bytes_count = d_strings_offsets->element(num_strings, stream); + if (bytes_count == 0) { // if no bytes, nothing to do + return uvector_pair{std::make_unique>(0, stream), + std::make_unique>(0, stream)}; + } - cudf::detail::grid_1d const grid{static_cast(bytes_count), THREADS_PER_BLOCK, 1}; + cudf::detail::grid_1d const grid{bytes_count, THREADS_PER_BLOCK, 1}; size_t const threads_on_device = grid.num_threads_per_block * grid.num_blocks; size_t const max_new_char_total = MAX_NEW_CHARS * threads_on_device; @@ -333,7 +338,7 @@ uvector_pair data_normalizer::normalize(char const* d_strings, num_strings, update_strings_lengths_fn{d_chars_per_thread.data(), d_strings_offsets->data()}); - uint32_t const num_chars = d_strings_offsets->element(num_strings, stream); + auto const num_chars = d_strings_offsets->element(num_strings, stream); d_code_points->resize(num_chars, stream); // should be smaller than original allocated size // return the normalized code points and the new offsets diff --git a/cpp/src/text/subword/detail/data_normalizer.hpp b/cpp/src/text/subword/detail/data_normalizer.hpp 
index 927de5a74f9..fb507b88e7e 100644 --- a/cpp/src/text/subword/detail/data_normalizer.hpp +++ b/cpp/src/text/subword/detail/data_normalizer.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2022, NVIDIA CORPORATION. + * Copyright (c) 2020-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -18,11 +18,13 @@ #include +#include + #include #include using uvector_pair = std::pair>, - std::unique_ptr>>; + std::unique_ptr>>; namespace nvtext { namespace detail { @@ -85,8 +87,8 @@ class data_normalizer { * used to locate the code points for each string. */ uvector_pair normalize(char const* d_strings, - uint32_t const* d_offsets, - uint32_t num_strings, + cudf::size_type const* d_offsets, + cudf::size_type num_strings, rmm::cuda_stream_view stream) const; private: diff --git a/cpp/src/text/subword/detail/tokenizer_utils.cuh b/cpp/src/text/subword/detail/tokenizer_utils.cuh index 5e8de1ba244..7cc0e7c0e24 100644 --- a/cpp/src/text/subword/detail/tokenizer_utils.cuh +++ b/cpp/src/text/subword/detail/tokenizer_utils.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2022, NVIDIA CORPORATION. + * Copyright (c) 2020-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -18,6 +18,8 @@ #include +#include + #include #include @@ -47,8 +49,9 @@ constexpr int THREADS_PER_BLOCK = 64; */ struct update_strings_lengths_fn { uint32_t const* d_chars_up_to_idx; - uint32_t* d_offsets; - __device__ void operator()(uint32_t idx) + cudf::size_type* d_offsets; + + __device__ void operator()(cudf::size_type idx) { auto const offset = d_offsets[idx]; d_offsets[idx] = offset > 0 ? 
d_chars_up_to_idx[offset - 1] : 0; diff --git a/cpp/src/text/subword/detail/wordpiece_tokenizer.hpp b/cpp/src/text/subword/detail/wordpiece_tokenizer.hpp index 2f528dce897..e191890eeca 100644 --- a/cpp/src/text/subword/detail/wordpiece_tokenizer.hpp +++ b/cpp/src/text/subword/detail/wordpiece_tokenizer.hpp @@ -82,8 +82,8 @@ class wordpiece_tokenizer { * @return Pointer to token-ids and token-id offsets */ uvector_pair tokenize(char const* d_strings, - uint32_t const* d_offsets, - uint32_t num_strings, + cudf::size_type const* d_offsets, + cudf::size_type num_strings, rmm::cuda_stream_view stream); private: diff --git a/cpp/src/text/subword/subword_tokenize.cu b/cpp/src/text/subword/subword_tokenize.cu index 47f602362f2..1a3084a257f 100644 --- a/cpp/src/text/subword/subword_tokenize.cu +++ b/cpp/src/text/subword/subword_tokenize.cu @@ -59,7 +59,7 @@ namespace { __global__ void kernel_compute_tensor_metadata( // input uint32_t const* token_ids, - uint32_t const* offsets, + cudf::size_type const* offsets, uint32_t const* row2tensor, uint32_t const* row2row_within_tensor, uint32_t max_sequence_length, @@ -71,8 +71,13 @@ __global__ void kernel_compute_tensor_metadata( uint32_t* attn_mask, uint32_t* metadata) { - uint32_t const output_idx = threadIdx.x + blockIdx.x * blockDim.x; - if (output_idx >= (nrows_tensor_token_ids * max_sequence_length)) return; + cudf::thread_index_type const output_idx = + threadIdx.x + static_cast(blockIdx.x) * + static_cast(blockDim.x); + if (output_idx >= (static_cast(nrows_tensor_token_ids) * + static_cast(max_sequence_length))) { + return; + } uint32_t const absolute_row_id = output_idx / max_sequence_length; uint32_t const tensor_id = row2tensor[absolute_row_id]; @@ -179,9 +184,9 @@ tokenizer_result subword_tokenize(cudf::strings_column_view const& strings, std::overflow_error); auto const offsets = strings.offsets(); - auto const d_offsets = offsets.data() + strings.offset(); - auto const offset = cudf::detail::get_value(offsets, strings.offset(), stream); - auto const d_chars = strings.chars().data() + offset; + auto const d_offsets = offsets.data() + strings.offset(); + auto const offset = cudf::detail::get_value(offsets, strings.offset(), stream); + auto const d_chars = strings.chars().data() + offset; // Create tokenizer wordpiece_tokenizer tokenizer( @@ -189,8 +194,8 @@ tokenizer_result subword_tokenize(cudf::strings_column_view const& strings, // Run tokenizer auto const tokens = tokenizer.tokenize(d_chars, d_offsets, strings_count, stream); // assign output components - uint32_t const* device_token_ids = tokens.first->data(); - uint32_t const* device_offsets = tokens.second->data(); + auto device_token_ids = tokens.first->data(); + auto device_offsets = tokens.second->data(); // Format output from tokenizer // Each string can create 1 or more tensor entries. 
diff --git a/cpp/src/text/subword/wordpiece_tokenizer.cu b/cpp/src/text/subword/wordpiece_tokenizer.cu index b6f6b7eda25..3b912017320 100644 --- a/cpp/src/text/subword/wordpiece_tokenizer.cu +++ b/cpp/src/text/subword/wordpiece_tokenizer.cu @@ -82,7 +82,9 @@ __global__ void init_data_and_mark_word_start_and_ends(uint32_t const* code_poin uint32_t* token_ids, uint8_t* tokens_per_word) { - uint32_t char_for_thread = blockDim.x * blockIdx.x + threadIdx.x; + cudf::thread_index_type char_for_thread = static_cast(blockDim.x) * + static_cast(blockIdx.x) + + threadIdx.x; // Deal with the start_word_indices array if (char_for_thread < num_code_points) { @@ -130,12 +132,14 @@ __global__ void init_data_and_mark_word_start_and_ends(uint32_t const* code_poin * @param num_strings The total number of strings to be processed. */ __global__ void mark_string_start_and_ends(uint32_t const* code_points, - uint32_t const* strings_offsets, + cudf::size_type const* strings_offsets, uint32_t* start_word_indices, uint32_t* end_word_indices, uint32_t num_strings) { - uint32_t idx = blockDim.x * blockIdx.x + threadIdx.x; + cudf::thread_index_type idx = static_cast(blockDim.x) * + static_cast(blockIdx.x) + + threadIdx.x; // Ensure the starting character of each strings is written to the word start array. if (idx <= num_strings) { auto const offset = strings_offsets[idx]; @@ -330,7 +334,9 @@ __global__ void kernel_wordpiece_tokenizer(uint32_t const* code_points, uint32_t* token_ids, uint8_t* tokens_per_word) { - uint32_t const word_to_tokenize = blockDim.x * blockIdx.x + threadIdx.x; + cudf::thread_index_type word_to_tokenize = static_cast(blockDim.x) * + static_cast(blockIdx.x) + + threadIdx.x; if (word_to_tokenize >= total_words) return; // Each thread gets the start code_point offset for each word and resets the token_id memory to @@ -414,8 +420,8 @@ wordpiece_tokenizer::wordpiece_tokenizer(hashed_vocabulary const& vocab_table, } uvector_pair wordpiece_tokenizer::tokenize(char const* d_strings, - uint32_t const* d_offsets, - uint32_t num_strings, + cudf::size_type const* d_offsets, + cudf::size_type num_strings, rmm::cuda_stream_view stream) { auto cps_and_offsets = normalizer.normalize(d_strings, d_offsets, num_strings, stream); @@ -433,10 +439,10 @@ struct tranform_fn { // just converting uint8 value to uint32 void wordpiece_tokenizer::tokenize(uvector_pair& cps_and_offsets, rmm::cuda_stream_view stream) { - uint32_t* device_code_points = cps_and_offsets.first->data(); - size_t const num_code_points = cps_and_offsets.first->size(); - uint32_t* device_strings_offsets = cps_and_offsets.second->data(); - uint32_t const num_strings = cps_and_offsets.second->size() - 1; + auto device_code_points = cps_and_offsets.first->data(); + auto const num_code_points = cps_and_offsets.first->size(); + auto device_strings_offsets = cps_and_offsets.second->data(); + auto const num_strings = cps_and_offsets.second->size() - 1; size_t const four_byte_cp_chunks = 1 + (num_code_points - 1) / sizeof(uint32_t); size_t const rounded_num_cps = sizeof(uint32_t) * four_byte_cp_chunks; diff --git a/cpp/src/text/tokenize.cu b/cpp/src/text/tokenize.cu index 8604152099c..16b9f25b802 100644 --- a/cpp/src/text/tokenize.cu +++ b/cpp/src/text/tokenize.cu @@ -52,12 +52,13 @@ std::unique_ptr token_count_fn(cudf::size_type strings_count, rmm::mr::device_memory_resource* mr) { // create output column - auto token_counts = cudf::make_numeric_column(cudf::data_type{cudf::type_id::INT32}, - strings_count, - cudf::mask_state::UNALLOCATED, - stream, - mr); 
- auto d_token_counts = token_counts->mutable_view().data(); + auto token_counts = + cudf::make_numeric_column(cudf::data_type{cudf::type_to_id()}, + strings_count, + cudf::mask_state::UNALLOCATED, + stream, + mr); + auto d_token_counts = token_counts->mutable_view().data(); // add the counts to the column thrust::transform(rmm::exec_policy(stream), thrust::make_counting_iterator(0), @@ -79,10 +80,10 @@ std::unique_ptr tokenize_fn(cudf::size_type strings_count, token_count_fn(strings_count, tokenizer, stream, rmm::mr::get_current_device_resource()); auto d_token_counts = token_counts->view(); // create token-index offsets from the counts - rmm::device_uvector token_offsets(strings_count + 1, stream); + rmm::device_uvector token_offsets(strings_count + 1, stream); thrust::inclusive_scan(rmm::exec_policy(stream), - d_token_counts.template begin(), - d_token_counts.template end(), + d_token_counts.template begin(), + d_token_counts.template end(), token_offsets.begin() + 1); token_offsets.set_element_to_zero_async(0, stream); auto const total_tokens = token_offsets.back_element(stream); @@ -177,10 +178,10 @@ std::unique_ptr character_tokenize(cudf::strings_column_view const } auto offsets = strings_column.offsets(); - auto offset = cudf::detail::get_value(offsets, strings_column.offset(), stream); - auto chars_bytes = - cudf::detail::get_value(offsets, strings_column.offset() + strings_count, stream) - - offset; + auto offset = cudf::detail::get_value(offsets, strings_column.offset(), stream); + auto chars_bytes = cudf::detail::get_value( + offsets, strings_column.offset() + strings_count, stream) - + offset; auto d_chars = strings_column.chars().data(); // unsigned is necessary for checking bits d_chars += offset; @@ -200,16 +201,17 @@ std::unique_ptr character_tokenize(cudf::strings_column_view const // create output offsets column // -- conditionally copy a counting iterator where // the first byte of each character is located - auto offsets_column = cudf::make_numeric_column(cudf::data_type{cudf::type_id::INT32}, - num_characters + 1, - cudf::mask_state::UNALLOCATED, - stream, - mr); - auto d_new_offsets = offsets_column->mutable_view().begin(); + auto offsets_column = + cudf::make_numeric_column(cudf::data_type{cudf::type_to_id()}, + num_characters + 1, + cudf::mask_state::UNALLOCATED, + stream, + mr); + auto d_new_offsets = offsets_column->mutable_view().begin(); thrust::copy_if( rmm::exec_policy(stream), - thrust::make_counting_iterator(0), - thrust::make_counting_iterator(chars_bytes + 1), + thrust::counting_iterator(0), + thrust::counting_iterator(chars_bytes + 1), d_new_offsets, [d_chars, chars_bytes] __device__(auto idx) { // this will also set the final value to the size chars_bytes diff --git a/cpp/src/text/utilities/tokenize_ops.cuh b/cpp/src/text/utilities/tokenize_ops.cuh index 89825e31e5c..fbd2d1efcff 100644 --- a/cpp/src/text/utilities/tokenize_ops.cuh +++ b/cpp/src/text/utilities/tokenize_ops.cuh @@ -149,7 +149,7 @@ struct characters_tokenizer { struct strings_tokenizer { cudf::column_device_view const d_strings; ///< strings to tokenize cudf::string_view const d_delimiter; ///< delimiter characters to tokenize around - int32_t* d_offsets{}; ///< offsets into the d_tokens vector for each string + cudf::size_type* d_offsets{}; ///< offsets into the d_tokens vector for each string string_index_pair* d_tokens{}; ///< token positions in device memory /** @@ -194,7 +194,7 @@ struct multi_delimiter_strings_tokenizer { cudf::column_device_view const d_strings; ///< strings column 
to tokenize delimiterator delimiters_begin; ///< first delimiter delimiterator delimiters_end; ///< last delimiter - int32_t* d_offsets{}; ///< offsets into the d_tokens output vector + cudf::size_type* d_offsets{}; ///< offsets into the d_tokens output vector string_index_pair* d_tokens{}; ///< token positions found for each string /** From ec1e73f8d04563c95fb5e0eb775c2e8c65ee0d64 Mon Sep 17 00:00:00 2001 From: GALI PREM SAGAR Date: Fri, 25 Aug 2023 13:49:28 -0500 Subject: [PATCH 104/230] Fix an issue with `IntervalIndex.repr` when null values are present (#13958) closes #13954 This PR fixes an issue with `IntervalIndex.repr`, where there was a silent failure because of no dedicated `_clean_nulls_from_index` and the `GenericIndex._clean_nulls_from_index` wouldn't work because a type-cast to `str` isn't implemented. Authors: - GALI PREM SAGAR (https://github.com/galipremsagar) Approvers: - Matthew Roeschke (https://github.com/mroeschke) URL: https://github.com/rapidsai/cudf/pull/13958 --- python/cudf/cudf/core/dtypes.py | 5 ++++- python/cudf/cudf/core/index.py | 5 ++++- python/cudf/cudf/tests/test_repr.py | 13 +++++++++++++ 3 files changed, 21 insertions(+), 2 deletions(-) diff --git a/python/cudf/cudf/core/dtypes.py b/python/cudf/cudf/core/dtypes.py index a83c1f7b3c9..5fb092c7cbc 100644 --- a/python/cudf/cudf/core/dtypes.py +++ b/python/cudf/cudf/core/dtypes.py @@ -905,9 +905,12 @@ def __init__(self, subtype, closed="right"): def subtype(self): return self.fields["left"] - def __repr__(self): + def __repr__(self) -> str: return f"interval[{self.subtype}, {self.closed}]" + def __str__(self) -> str: + return self.__repr__() + @classmethod def from_arrow(cls, typ): return IntervalDtype(typ.subtype.to_pandas_dtype(), typ.closed) diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py index 458ef2df02d..c7e25cdc430 100644 --- a/python/cudf/cudf/core/index.py +++ b/python/cudf/cudf/core/index.py @@ -3187,7 +3187,7 @@ def from_breaks(breaks, closed="right", name=None, copy=False, dtype=None): >>> import cudf >>> import pandas as pd >>> cudf.IntervalIndex.from_breaks([0, 1, 2, 3]) - IntervalIndex([(0, 1], (1, 2], (2, 3]], dtype='interval') + IntervalIndex([(0, 1], (1, 2], (2, 3]], dtype='interval[int64, right]') """ if copy: breaks = column.as_column(breaks, dtype=dtype).copy() @@ -3211,6 +3211,9 @@ def _is_interval(self): def _is_boolean(self): return False + def _clean_nulls_from_index(self): + return self + class StringIndex(GenericIndex): """String defined indices into another Column diff --git a/python/cudf/cudf/tests/test_repr.py b/python/cudf/cudf/tests/test_repr.py index b944e0483d0..a36cc1b3819 100644 --- a/python/cudf/cudf/tests/test_repr.py +++ b/python/cudf/cudf/tests/test_repr.py @@ -1469,3 +1469,16 @@ def test_repr_struct_after_concat(): pdf = df.to_pandas() assert repr(df) == repr(pdf) + + +def test_interval_index_repr(): + pi = pd.Index( + [ + np.nan, + pd.Interval(2.0, 3.0, closed="right"), + pd.Interval(3.0, 4.0, closed="right"), + ] + ) + gi = cudf.from_pandas(pi) + + assert repr(pi) == repr(gi) From 6d10a82076ffbd5530d0d0b5f4c6277d2a4f9d7a Mon Sep 17 00:00:00 2001 From: GALI PREM SAGAR Date: Fri, 25 Aug 2023 14:35:48 -0500 Subject: [PATCH 105/230] Fix `MultiIndex.to_numpy` to return numpy array with tuples (#13966) closes #13961 The absence of `MultiIndex.to_numpy` resulted in calling of `Frame.to_numpy` that returns a numpy array without any tuples, this PR adds `MultiIndex.to_numpy` so that a numpy array of tuples is returned. 
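A hypothetical usage sketch of the new behavior (the index values and the commented output are illustrative, not copied from a real session):

```python
import cudf

# Build a small MultiIndex; from_tuples mirrors the pandas constructor.
midx = cudf.MultiIndex.from_tuples([(1, 10), (2, 20), (3, 30)], names=["a", "b"])

# With this change, to_numpy() returns values_host and keeps the tuple
# structure, matching pandas.MultiIndex.to_numpy(); previously Frame.to_numpy
# was used and the tuples were lost.
arr = midx.to_numpy()
print(arr)  # expected: an object-dtype array of tuples, e.g. [(1, 10) (2, 20) (3, 30)]
```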
Authors: - GALI PREM SAGAR (https://github.com/galipremsagar) Approvers: - Matthew Roeschke (https://github.com/mroeschke) - Bradley Dice (https://github.com/bdice) URL: https://github.com/rapidsai/cudf/pull/13966 --- python/cudf/cudf/core/multiindex.py | 4 ++++ python/cudf/cudf/tests/test_multiindex.py | 11 +++++++++++ 2 files changed, 15 insertions(+) diff --git a/python/cudf/cudf/core/multiindex.py b/python/cudf/cudf/core/multiindex.py index 5ab9af36175..eb953a54f6b 100644 --- a/python/cudf/cudf/core/multiindex.py +++ b/python/cudf/cudf/core/multiindex.py @@ -1151,6 +1151,10 @@ def from_tuples(cls, tuples, names=None): pdi = pd.MultiIndex.from_tuples(tuples, names=names) return cls.from_pandas(pdi) + @_cudf_nvtx_annotate + def to_numpy(self): + return self.values_host + @property # type: ignore @_cudf_nvtx_annotate def values_host(self): diff --git a/python/cudf/cudf/tests/test_multiindex.py b/python/cudf/cudf/tests/test_multiindex.py index a4099bb7f88..464b9623bad 100644 --- a/python/cudf/cudf/tests/test_multiindex.py +++ b/python/cudf/cudf/tests/test_multiindex.py @@ -1182,6 +1182,17 @@ def test_multiindex_values_host(): assert_eq(midx.values_host, pmidx.values) +def test_multiindex_to_numpy(): + midx = cudf.MultiIndex( + levels=[[1, 3, 4, 5], [1, 2, 5]], + codes=[[0, 0, 1, 2, 3], [0, 2, 1, 1, 0]], + names=["x", "y"], + ) + pmidx = midx.to_pandas() + + assert_eq(midx.to_numpy(), pmidx.to_numpy()) + + @pytest.mark.parametrize( "gdi, fill_value, expected", [ From 89787f24b957408d051791ebe725d5eee30c4814 Mon Sep 17 00:00:00 2001 From: GALI PREM SAGAR Date: Fri, 25 Aug 2023 14:44:12 -0500 Subject: [PATCH 106/230] Handle `Interval` scalars when passed in list-like inputs to `cudf.Index` (#13956) closes #13952 This PR fixes an issue with `IntervalColumn` construction where we can utilize the existing type inference to create a pandas Series and then construct an `IntervalColumn` out of it since pyarrow is unable to read this kind of input correctly. Authors: - GALI PREM SAGAR (https://github.com/galipremsagar) Approvers: - Matthew Roeschke (https://github.com/mroeschke) URL: https://github.com/rapidsai/cudf/pull/13956 --- python/cudf/cudf/core/column/column.py | 10 +++++++--- python/cudf/cudf/tests/test_interval.py | 13 +++++++++++++ 2 files changed, 20 insertions(+), 3 deletions(-) diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index 446f01ef419..eafcc18450d 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -2454,25 +2454,29 @@ def as_column( def _construct_array( arbitrary: Any, dtype: Optional[Dtype] -) -> Union[np.ndarray, cupy.ndarray]: +) -> Union[np.ndarray, cupy.ndarray, pd.api.extensions.ExtensionArray]: """ - Construct a CuPy or NumPy array from `arbitrary` + Construct a CuPy/NumPy/Pandas array from `arbitrary` """ try: dtype = dtype if dtype is None else cudf.dtype(dtype) arbitrary = cupy.asarray(arbitrary, dtype=dtype) except (TypeError, ValueError): native_dtype = dtype + inferred_dtype = None if ( dtype is None and not cudf._lib.scalar._is_null_host_scalar(arbitrary) - and infer_dtype(arbitrary, skipna=False) + and (inferred_dtype := infer_dtype(arbitrary, skipna=False)) in ( "mixed", "mixed-integer", ) ): native_dtype = "object" + if inferred_dtype == "interval": + # Only way to construct an Interval column. 
+ return pd.array(arbitrary) arbitrary = np.asarray( arbitrary, dtype=native_dtype diff --git a/python/cudf/cudf/tests/test_interval.py b/python/cudf/cudf/tests/test_interval.py index 18454172289..f2e8f585a69 100644 --- a/python/cudf/cudf/tests/test_interval.py +++ b/python/cudf/cudf/tests/test_interval.py @@ -136,6 +136,19 @@ def test_create_interval_df(data1, data2, data3, data4, closed): assert_eq(expect_three, got_three) +def test_create_interval_index_from_list(): + interval_list = [ + np.nan, + pd.Interval(2.0, 3.0, closed="right"), + pd.Interval(3.0, 4.0, closed="right"), + ] + + expected = pd.Index(interval_list) + actual = cudf.Index(interval_list) + + assert_eq(expected, actual) + + def test_interval_index_unique(): interval_list = [ np.nan, From b6d08cae87aa489706a1fc1eefde5c1efe3f3ebf Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Fri, 25 Aug 2023 15:31:41 -0500 Subject: [PATCH 107/230] Switch pylibcudf-enabled types to use enum class in Cython (#13931) This PR leverages Cython 3's support for C++'s scoped enumerations to more faithfully translate libcudf types into cuDF Cython and pylibcudf. Due to some Cython 3 limitations, there are a few workarounds in this PR. However, they are relatively minor and can be adjusted later as Cython improves. In the meantime, the change here is an improvement still worth merging, especially since it sets a template for how pylibcudf code should look as more of it emerges. Existing cuDF Cython is only updated to the minimum extent required for it to compile. Fully switching the old code to use enum class-style syntax isn't worthwhile since those internals should eventually be migrated to use pylibcudf in pure Python mode anyway. Authors: - Vyas Ramasubramani (https://github.com/vyasr) Approvers: - Ashwin Srinath (https://github.com/shwina) - Bradley Dice (https://github.com/bdice) URL: https://github.com/rapidsai/cudf/pull/13931 --- docs/cudf/source/developer_guide/pylibcudf.md | 66 +++++++++++++++---- python/cudf/cudf/_lib/CMakeLists.txt | 1 + python/cudf/cudf/_lib/cpp/CMakeLists.txt | 23 +++++++ python/cudf/cudf/_lib/cpp/copying.pxd | 6 +- python/cudf/cudf/_lib/cpp/copying.pyx | 0 python/cudf/cudf/_lib/cpp/types.pxd | 65 +++++++++--------- python/cudf/cudf/_lib/cpp/types.pyx | 0 python/cudf/cudf/_lib/pylibcudf/__init__.pxd | 5 +- python/cudf/cudf/_lib/pylibcudf/column.pyx | 5 +- python/cudf/cudf/_lib/pylibcudf/copying.pxd | 20 +----- python/cudf/cudf/_lib/pylibcudf/copying.pyx | 20 +++--- python/cudf/cudf/_lib/pylibcudf/types.pxd | 55 +--------------- python/cudf/cudf/_lib/pylibcudf/types.pyx | 16 ++--- python/cudf/cudf/_lib/scalar.pyx | 56 ++++++++-------- .../strings/convert/convert_fixed_point.pyx | 12 ++-- python/cudf/cudf/_lib/types.pyx | 1 + 16 files changed, 177 insertions(+), 174 deletions(-) create mode 100644 python/cudf/cudf/_lib/cpp/CMakeLists.txt create mode 100644 python/cudf/cudf/_lib/cpp/copying.pyx create mode 100644 python/cudf/cudf/_lib/cpp/types.pyx diff --git a/docs/cudf/source/developer_guide/pylibcudf.md b/docs/cudf/source/developer_guide/pylibcudf.md index 4940e329653..1b321dbb1fe 100644 --- a/docs/cudf/source/developer_guide/pylibcudf.md +++ b/docs/cudf/source/developer_guide/pylibcudf.md @@ -86,7 +86,7 @@ cpdef Table gather( cpp_copying.gather( source_table.view(), gather_map.view(), - py_policy_to_c_policy(bounds_policy) + bounds_policy ) ) return Table.from_libcudf(move(c_result)) @@ -94,18 +94,62 @@ cpdef Table gather( There are a couple of notable points from the snippet above: - The object returned 
from libcudf is immediately converted to a pylibcudf type. -- `cudf::gather` accepts a `cudf::out_of_bounds_policy` enum parameter, which is mirrored by the `cdef `class OutOfBoundsPolicy` as mentioned in [the data structures example above](data-structures). +- `cudf::gather` accepts a `cudf::out_of_bounds_policy` enum parameter. `OutOfBoundsPolicy` is an alias for this type in pylibcudf that matches our Python naming conventions (CapsCase instead of snake\_case). ## Miscellaneous Notes -### Cython Scoped Enums and Casting -Cython does not support scoped enumerations. -It assumes that enums correspond to their underlying value types and will thus attempt operations that are invalid. -To fix this, many places in pylibcudf Cython code contain double casts that look like +### Cython Scoped Enums +Cython 3 introduced support for scoped enumerations. +However, this support has some bugs as well as some easy pitfalls. +Our usage of enums is intended to minimize the complexity of our code while also working around Cython's limitations. + +```{warning} +The guidance in this section may change often as Cython is updated and our understanding of best practices evolves. +``` + +- All pxd files that declare a C++ enum should use `cpdef enum class` declarations. + - Reason: This declaration makes the C++ enum available in Cython code while also transparently creating a Python enum. +- Any pxd file containing only C++ declarations must still have a corresponding pyx file if any of the declarations are scoped enums. + - Reason: The creation of the Python enum requires that Cython actually generate the necessary Python C API code, which will not happen if only a pxd file is present. +- If a C++ enum will be part of a pylibcudf module's public API, then it should be imported (not cimported) directly into the pyx file and aliased with a name that matches our Python class naming conventions (CapsCase) instead of our C++ naming convention (snake\_case). + - Reason: We want to expose the enum to both Python and Cython consumers of the module. As a side effect, this aliasing avoids [this Cython bug](https://github.com/cython/cython/issues/5609). + - Note: Once the above Cython bug is resolved, the enum should also be aliased into the pylibcudf pxd file when it is cimported so that Python and Cython usage will match. + +Here is an example of appropriate enum usage. + + ```cython -return ( - py_policy -) +# cpp/copying.pxd +cdef extern from "cudf/copying.hpp" namespace "cudf" nogil: + # cpdef here so that we export both a cdef enum class and a Python enum.Enum. + cpdef enum class out_of_bounds_policy(bool): + NULLIFY + DONT_CHECK + + +# cpp/copying.pyx +# This file is empty, but is required to compile the Python enum in cpp/copying.pxd + + +# pylibcudf/copying.pxd + +# cimport the enum using the exact name +# Once https://github.com/cython/cython/issues/5609 is resolved, +# this import should instead be +# from cudf._lib.cpp.copying cimport out_of_bounds_policy as OutOfBoundsPolicy +from cudf._lib.cpp.copying cimport out_of_bounds_policy + + +# pylibcudf/copying.pyx +# Access cpp.copying members that aren't part of this module's public API via +# this module alias +from cudf._lib.cpp cimport copying as cpp_copying +from cudf._lib.cpp.copying cimport out_of_bounds_policy + +# This import exposes the enum in the public API of this module. 
+# It requires a no-cython-lint tag because it will be unused: all typing of +# parameters etc will need to use the Cython name `out_of_bounds_policy` until +# the Cython bug is resolved. +from cudf._lib.cpp.copying import \ + out_of_bounds_policy as OutOfBoundsPolicy # no-cython-lint ``` -where `cpp_type` is some libcudf enum with a specified underlying type. -This double-cast will be removed when we migrate to Cython 3, which adds proper support for C++ scoped enumerations. diff --git a/python/cudf/cudf/_lib/CMakeLists.txt b/python/cudf/cudf/_lib/CMakeLists.txt index 37544e1c7cd..06de6cc825f 100644 --- a/python/cudf/cudf/_lib/CMakeLists.txt +++ b/python/cudf/cudf/_lib/CMakeLists.txt @@ -106,6 +106,7 @@ foreach(target IN LISTS targets_using_arrow_headers) target_include_directories(${target} PRIVATE "${PYARROW_INCLUDE_DIR}") endforeach() +add_subdirectory(cpp) add_subdirectory(io) add_subdirectory(nvtext) add_subdirectory(pylibcudf) diff --git a/python/cudf/cudf/_lib/cpp/CMakeLists.txt b/python/cudf/cudf/_lib/cpp/CMakeLists.txt new file mode 100644 index 00000000000..a99aa58dfe8 --- /dev/null +++ b/python/cudf/cudf/_lib/cpp/CMakeLists.txt @@ -0,0 +1,23 @@ +# ============================================================================= +# Copyright (c) 2022-2023, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software distributed under the License +# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express +# or implied. See the License for the specific language governing permissions and limitations under +# the License. 
+# ============================================================================= + +set(cython_sources copying.pyx types.pyx) + +set(linked_libraries cudf::cudf) + +rapids_cython_create_modules( + CXX + SOURCE_FILES "${cython_sources}" + LINKED_LIBRARIES "${linked_libraries}" ASSOCIATED_TARGETS cudf MODULE_PREFIX cpp +) diff --git a/python/cudf/cudf/_lib/cpp/copying.pxd b/python/cudf/cudf/_lib/cpp/copying.pxd index 8961675711f..20725c252fc 100644 --- a/python/cudf/cudf/_lib/cpp/copying.pxd +++ b/python/cudf/cudf/_lib/cpp/copying.pxd @@ -19,9 +19,9 @@ from cudf._lib.exception_handler cimport cudf_exception_handler ctypedef const scalar constscalar cdef extern from "cudf/copying.hpp" namespace "cudf" nogil: - ctypedef enum out_of_bounds_policy: - NULLIFY 'cudf::out_of_bounds_policy::NULLIFY' - DONT_CHECK 'cudf::out_of_bounds_policy::DONT_CHECK' + cpdef enum class out_of_bounds_policy(bool): + NULLIFY + DONT_CHECK cdef unique_ptr[table] gather ( const table_view& source_table, diff --git a/python/cudf/cudf/_lib/cpp/copying.pyx b/python/cudf/cudf/_lib/cpp/copying.pyx new file mode 100644 index 00000000000..e69de29bb2d diff --git a/python/cudf/cudf/_lib/cpp/types.pxd b/python/cudf/cudf/_lib/cpp/types.pxd index 11480d774ef..14bf8a83de0 100644 --- a/python/cudf/cudf/_lib/cpp/types.pxd +++ b/python/cudf/cudf/_lib/cpp/types.pxd @@ -4,6 +4,11 @@ from libc.stdint cimport int32_t, uint32_t cdef extern from "cudf/types.hpp" namespace "cudf" nogil: + # The declaration below is to work around + # https://github.com/cython/cython/issues/5637 + """ + #define __PYX_ENUM_CLASS_DECL enum + """ ctypedef int32_t size_type ctypedef uint32_t bitmask_type ctypedef uint32_t char_utf8 @@ -49,36 +54,36 @@ cdef extern from "cudf/types.hpp" namespace "cudf" nogil: ALL_EQUAL "cudf::nan_equality::ALL_EQUAL" NANS_UNEQUAL "cudf::nan_equality::UNEQUAL" - ctypedef enum type_id "cudf::type_id": - EMPTY "cudf::type_id::EMPTY" - INT8 "cudf::type_id::INT8" - INT16 "cudf::type_id::INT16" - INT32 "cudf::type_id::INT32" - INT64 "cudf::type_id::INT64" - UINT8 "cudf::type_id::UINT8" - UINT16 "cudf::type_id::UINT16" - UINT32 "cudf::type_id::UINT32" - UINT64 "cudf::type_id::UINT64" - FLOAT32 "cudf::type_id::FLOAT32" - FLOAT64 "cudf::type_id::FLOAT64" - BOOL8 "cudf::type_id::BOOL8" - TIMESTAMP_DAYS "cudf::type_id::TIMESTAMP_DAYS" - TIMESTAMP_SECONDS "cudf::type_id::TIMESTAMP_SECONDS" - TIMESTAMP_MILLISECONDS "cudf::type_id::TIMESTAMP_MILLISECONDS" - TIMESTAMP_MICROSECONDS "cudf::type_id::TIMESTAMP_MICROSECONDS" - TIMESTAMP_NANOSECONDS "cudf::type_id::TIMESTAMP_NANOSECONDS" - DICTIONARY32 "cudf::type_id::DICTIONARY32" - STRING "cudf::type_id::STRING" - LIST "cudf::type_id::LIST" - STRUCT "cudf::type_id::STRUCT" - NUM_TYPE_IDS "cudf::type_id::NUM_TYPE_IDS" - DURATION_SECONDS "cudf::type_id::DURATION_SECONDS" - DURATION_MILLISECONDS "cudf::type_id::DURATION_MILLISECONDS" - DURATION_MICROSECONDS "cudf::type_id::DURATION_MICROSECONDS" - DURATION_NANOSECONDS "cudf::type_id::DURATION_NANOSECONDS" - DECIMAL32 "cudf::type_id::DECIMAL32" - DECIMAL64 "cudf::type_id::DECIMAL64" - DECIMAL128 "cudf::type_id::DECIMAL128" + cpdef enum class type_id(int32_t): + EMPTY + INT8 + INT16 + INT32 + INT64 + UINT8 + UINT16 + UINT32 + UINT64 + FLOAT32 + FLOAT64 + BOOL8 + TIMESTAMP_DAYS + TIMESTAMP_SECONDS + TIMESTAMP_MILLISECONDS + TIMESTAMP_MICROSECONDS + TIMESTAMP_NANOSECONDS + DICTIONARY32 + STRING + LIST + STRUCT + NUM_TYPE_IDS + DURATION_SECONDS + DURATION_MILLISECONDS + DURATION_MICROSECONDS + DURATION_NANOSECONDS + DECIMAL32 + DECIMAL64 + DECIMAL128 cdef 
cppclass data_type: data_type() except + diff --git a/python/cudf/cudf/_lib/cpp/types.pyx b/python/cudf/cudf/_lib/cpp/types.pyx new file mode 100644 index 00000000000..e69de29bb2d diff --git a/python/cudf/cudf/_lib/pylibcudf/__init__.pxd b/python/cudf/cudf/_lib/pylibcudf/__init__.pxd index b4f8bfad4fb..ba7822b0a54 100644 --- a/python/cudf/cudf/_lib/pylibcudf/__init__.pxd +++ b/python/cudf/cudf/_lib/pylibcudf/__init__.pxd @@ -5,13 +5,14 @@ from . cimport copying from .column cimport Column from .gpumemoryview cimport gpumemoryview from .table cimport Table -from .types cimport DataType, TypeId +# TODO: cimport type_id once +# https://github.com/cython/cython/issues/5609 is resolved +from .types cimport DataType __all__ = [ "Column", "DataType", "Table", - "TypeId", "copying", "gpumemoryview", ] diff --git a/python/cudf/cudf/_lib/pylibcudf/column.pyx b/python/cudf/cudf/_lib/pylibcudf/column.pyx index d9b2ca98ead..40afc8aaa8a 100644 --- a/python/cudf/cudf/_lib/pylibcudf/column.pyx +++ b/python/cudf/cudf/_lib/pylibcudf/column.pyx @@ -9,7 +9,7 @@ from cudf._lib.cpp.column.column cimport column, column_contents from cudf._lib.cpp.types cimport size_type from .gpumemoryview cimport gpumemoryview -from .types cimport DataType, TypeId +from .types cimport DataType, type_id from .utils cimport int_to_bitmask_ptr, int_to_void_ptr @@ -179,10 +179,11 @@ cdef class Column: cpdef list children(self): return self._children + cdef class ListColumnView: """Accessor for methods of a Column that are specific to lists.""" def __init__(self, Column col): - if col.type().id() != TypeId.LIST: + if col.type().id() != type_id.LIST: raise TypeError("Column is not a list type") self._column = col diff --git a/python/cudf/cudf/_lib/pylibcudf/copying.pxd b/python/cudf/cudf/_lib/pylibcudf/copying.pxd index 0ebffacfb9f..d57be650710 100644 --- a/python/cudf/cudf/_lib/pylibcudf/copying.pxd +++ b/python/cudf/cudf/_lib/pylibcudf/copying.pxd @@ -2,30 +2,14 @@ from libcpp cimport bool as cbool -from cudf._lib.cpp cimport copying as cpp_copying +from cudf._lib.cpp.copying cimport out_of_bounds_policy from .column cimport Column from .table cimport Table -ctypedef cbool underlying_type_t_out_of_bounds_policy - - -# Enum representing possible enum policies. This is the Cython representation -# of libcudf's out_of_bounds_policy. -cpdef enum OutOfBoundsPolicy: - NULLIFY = cpp_copying.NULLIFY - DONT_CHECK = ( - cpp_copying.DONT_CHECK - ) - - -cdef cpp_copying.out_of_bounds_policy py_policy_to_c_policy( - OutOfBoundsPolicy py_policy -) nogil - cpdef Table gather( Table source_table, Column gather_map, - OutOfBoundsPolicy bounds_policy + out_of_bounds_policy bounds_policy ) diff --git a/python/cudf/cudf/_lib/pylibcudf/copying.pyx b/python/cudf/cudf/_lib/pylibcudf/copying.pyx index 7869a917983..a27b44b3107 100644 --- a/python/cudf/cudf/_lib/pylibcudf/copying.pyx +++ b/python/cudf/cudf/_lib/pylibcudf/copying.pyx @@ -8,27 +8,23 @@ from libcpp.utility cimport move # we really want here would be # cimport libcudf... libcudf.copying.algo(...) 
from cudf._lib.cpp cimport copying as cpp_copying +from cudf._lib.cpp.copying cimport out_of_bounds_policy + +from cudf._lib.cpp.copying import \ + out_of_bounds_policy as OutOfBoundsPolicy # no-cython-lint + from cudf._lib.cpp.table.table cimport table from .column cimport Column from .table cimport Table -cdef inline cpp_copying.out_of_bounds_policy py_policy_to_c_policy( - OutOfBoundsPolicy py_policy -) nogil: - """Convert a Cython policy the corresponding libcudf policy type.""" - return ( - py_policy - ) - - # TODO: Is it OK to reference the corresponding libcudf algorithm in the # documentation? Otherwise there's a lot of room for duplication. cpdef Table gather( Table source_table, Column gather_map, - OutOfBoundsPolicy bounds_policy + out_of_bounds_policy bounds_policy ): """Select rows from source_table according to the provided gather_map. @@ -40,7 +36,7 @@ cpdef Table gather( The table object from which to pull data. gather_map : Column The list of row indices to pull out of the source table. - bounds_policy : OutOfBoundsPolicy + bounds_policy : out_of_bounds_policy Controls whether out of bounds indices are checked and nullified in the output or if indices are assumed to be in bounds. @@ -55,7 +51,7 @@ cpdef Table gather( cpp_copying.gather( source_table.view(), gather_map.view(), - py_policy_to_c_policy(bounds_policy) + bounds_policy ) ) return Table.from_libcudf(move(c_result)) diff --git a/python/cudf/cudf/_lib/pylibcudf/types.pxd b/python/cudf/cudf/_lib/pylibcudf/types.pxd index af0de6ba446..80baa484be7 100644 --- a/python/cudf/cudf/_lib/pylibcudf/types.pxd +++ b/python/cudf/cudf/_lib/pylibcudf/types.pxd @@ -3,64 +3,13 @@ from libc.stdint cimport int32_t from libcpp cimport bool as cbool -from cudf._lib.cpp.types cimport data_type, interpolation, null_policy, type_id - -ctypedef int32_t underlying_type_t_type_id - - -# Enum representing possible data type ids. This is the Cython representation -# of libcudf's type_id. 
-cpdef enum TypeId: - EMPTY = type_id.EMPTY - INT8 = type_id.INT8 - INT16 = type_id.INT16 - INT32 = type_id.INT32 - INT64 = type_id.INT64 - UINT8 = type_id.UINT8 - UINT16 = type_id.UINT16 - UINT32 = type_id.UINT32 - UINT64 = type_id.UINT64 - FLOAT32 = type_id.FLOAT32 - FLOAT64 = type_id.FLOAT64 - BOOL8 = type_id.BOOL8 - TIMESTAMP_DAYS = type_id.TIMESTAMP_DAYS - TIMESTAMP_SECONDS = type_id.TIMESTAMP_SECONDS - TIMESTAMP_MILLISECONDS = ( - type_id.TIMESTAMP_MILLISECONDS - ) - TIMESTAMP_MICROSECONDS = ( - type_id.TIMESTAMP_MICROSECONDS - ) - TIMESTAMP_NANOSECONDS = ( - type_id.TIMESTAMP_NANOSECONDS - ) - DICTIONARY32 = type_id.DICTIONARY32 - STRING = type_id.STRING - LIST = type_id.LIST - STRUCT = type_id.STRUCT - NUM_TYPE_IDS = type_id.NUM_TYPE_IDS - DURATION_SECONDS = type_id.DURATION_SECONDS - DURATION_MILLISECONDS = ( - type_id.DURATION_MILLISECONDS - ) - DURATION_MICROSECONDS = ( - type_id.DURATION_MICROSECONDS - ) - DURATION_NANOSECONDS = ( - type_id.DURATION_NANOSECONDS - ) - DECIMAL32 = type_id.DECIMAL32 - DECIMAL64 = type_id.DECIMAL64 - DECIMAL128 = type_id.DECIMAL128 - - -cdef type_id py_type_to_c_type(TypeId py_type_id) nogil +from cudf._lib.cpp.types cimport data_type, type_id cdef class DataType: cdef data_type c_obj - cpdef TypeId id(self) + cpdef type_id id(self) cpdef int32_t scale(self) @staticmethod diff --git a/python/cudf/cudf/_lib/pylibcudf/types.pyx b/python/cudf/cudf/_lib/pylibcudf/types.pyx index 23d24182ac4..b1391723f0e 100644 --- a/python/cudf/cudf/_lib/pylibcudf/types.pyx +++ b/python/cudf/cudf/_lib/pylibcudf/types.pyx @@ -2,11 +2,9 @@ from libc.stdint cimport int32_t -from cudf._lib.cpp.types cimport type_id +from cudf._lib.cpp.types cimport data_type, type_id - -cdef type_id py_type_to_c_type(TypeId py_type_id) nogil: - return ( py_type_id) +from cudf._lib.cpp.types import type_id as TypeId # no-cython-lint cdef class DataType: @@ -21,13 +19,13 @@ cdef class DataType: scale : int The scale associated with the data. Only used for decimal data types. """ - def __cinit__(self, TypeId id, int32_t scale=0): - self.c_obj = data_type(py_type_to_c_type(id), scale) + def __cinit__(self, type_id id, int32_t scale=0): + self.c_obj = data_type(id, scale) # TODO: Consider making both id and scale cached properties. - cpdef TypeId id(self): + cpdef type_id id(self): """Get the id associated with this data type.""" - return TypeId(self.c_obj.id()) + return self.c_obj.id() cpdef int32_t scale(self): """Get the scale associated with this data type.""" @@ -42,6 +40,6 @@ cdef class DataType: (even direct pylibcudf Cython users). """ # Spoof an empty data type then swap in the real one. 
- cdef DataType ret = DataType.__new__(DataType, TypeId.EMPTY) + cdef DataType ret = DataType.__new__(DataType, type_id.EMPTY) ret.c_obj = dt return ret diff --git a/python/cudf/cudf/_lib/scalar.pyx b/python/cudf/cudf/_lib/scalar.pyx index 39a1b0609cf..0407785b2d8 100644 --- a/python/cudf/cudf/_lib/scalar.pyx +++ b/python/cudf/cudf/_lib/scalar.pyx @@ -200,23 +200,23 @@ cdef class DeviceScalar: if dtype is not None: s._dtype = dtype elif cdtype.id() in { - libcudf_types.DECIMAL32, - libcudf_types.DECIMAL64, - libcudf_types.DECIMAL128, + libcudf_types.type_id.DECIMAL32, + libcudf_types.type_id.DECIMAL64, + libcudf_types.type_id.DECIMAL128, }: raise TypeError( "Must pass a dtype when constructing from a fixed-point scalar" ) - elif cdtype.id() == libcudf_types.STRUCT: + elif cdtype.id() == libcudf_types.type_id.STRUCT: struct_table_view = (s.get_raw_ptr())[0].view() s._dtype = StructDtype({ str(i): dtype_from_column_view(struct_table_view.column(i)) for i in range(struct_table_view.num_columns()) }) - elif cdtype.id() == libcudf_types.LIST: + elif cdtype.id() == libcudf_types.type_id.LIST: if ( s.get_raw_ptr() - )[0].view().type().id() == libcudf_types.LIST: + )[0].view().type().id() == libcudf_types.type_id.LIST: s._dtype = dtype_from_column_view( (s.get_raw_ptr())[0].view() ) @@ -442,27 +442,27 @@ cdef _get_np_scalar_from_numeric(unique_ptr[scalar]& s): cdef libcudf_types.data_type cdtype = s_ptr[0].type() - if cdtype.id() == libcudf_types.INT8: + if cdtype.id() == libcudf_types.type_id.INT8: return np.int8((s_ptr)[0].value()) - elif cdtype.id() == libcudf_types.INT16: + elif cdtype.id() == libcudf_types.type_id.INT16: return np.int16((s_ptr)[0].value()) - elif cdtype.id() == libcudf_types.INT32: + elif cdtype.id() == libcudf_types.type_id.INT32: return np.int32((s_ptr)[0].value()) - elif cdtype.id() == libcudf_types.INT64: + elif cdtype.id() == libcudf_types.type_id.INT64: return np.int64((s_ptr)[0].value()) - elif cdtype.id() == libcudf_types.UINT8: + elif cdtype.id() == libcudf_types.type_id.UINT8: return np.uint8((s_ptr)[0].value()) - elif cdtype.id() == libcudf_types.UINT16: + elif cdtype.id() == libcudf_types.type_id.UINT16: return np.uint16((s_ptr)[0].value()) - elif cdtype.id() == libcudf_types.UINT32: + elif cdtype.id() == libcudf_types.type_id.UINT32: return np.uint32((s_ptr)[0].value()) - elif cdtype.id() == libcudf_types.UINT64: + elif cdtype.id() == libcudf_types.type_id.UINT64: return np.uint64((s_ptr)[0].value()) - elif cdtype.id() == libcudf_types.FLOAT32: + elif cdtype.id() == libcudf_types.type_id.FLOAT32: return np.float32((s_ptr)[0].value()) - elif cdtype.id() == libcudf_types.FLOAT64: + elif cdtype.id() == libcudf_types.type_id.FLOAT64: return np.float64((s_ptr)[0].value()) - elif cdtype.id() == libcudf_types.BOOL8: + elif cdtype.id() == libcudf_types.type_id.BOOL8: return np.bool_((s_ptr)[0].value()) else: raise ValueError("Could not convert cudf::scalar to numpy scalar") @@ -475,15 +475,15 @@ cdef _get_py_decimal_from_fixed_point(unique_ptr[scalar]& s): cdef libcudf_types.data_type cdtype = s_ptr[0].type() - if cdtype.id() == libcudf_types.DECIMAL64: + if cdtype.id() == libcudf_types.type_id.DECIMAL64: rep_val = int((s_ptr)[0].value()) scale = int((s_ptr)[0].type().scale()) return decimal.Decimal(rep_val).scaleb(scale) - elif cdtype.id() == libcudf_types.DECIMAL32: + elif cdtype.id() == libcudf_types.type_id.DECIMAL32: rep_val = int((s_ptr)[0].value()) scale = int((s_ptr)[0].type().scale()) return decimal.Decimal(rep_val).scaleb(scale) - elif cdtype.id() == 
libcudf_types.DECIMAL128: + elif cdtype.id() == libcudf_types.type_id.DECIMAL128: rep_val = int((s_ptr)[0].value()) scale = int((s_ptr)[0].type().scale()) return decimal.Decimal(rep_val).scaleb(scale) @@ -499,28 +499,28 @@ cdef _get_np_scalar_from_timestamp64(unique_ptr[scalar]& s): cdef libcudf_types.data_type cdtype = s_ptr[0].type() - if cdtype.id() == libcudf_types.TIMESTAMP_SECONDS: + if cdtype.id() == libcudf_types.type_id.TIMESTAMP_SECONDS: return np.datetime64( ( s_ptr )[0].ticks_since_epoch_64(), datetime_unit_map[(cdtype.id())] ) - elif cdtype.id() == libcudf_types.TIMESTAMP_MILLISECONDS: + elif cdtype.id() == libcudf_types.type_id.TIMESTAMP_MILLISECONDS: return np.datetime64( ( s_ptr )[0].ticks_since_epoch_64(), datetime_unit_map[(cdtype.id())] ) - elif cdtype.id() == libcudf_types.TIMESTAMP_MICROSECONDS: + elif cdtype.id() == libcudf_types.type_id.TIMESTAMP_MICROSECONDS: return np.datetime64( ( s_ptr )[0].ticks_since_epoch_64(), datetime_unit_map[(cdtype.id())] ) - elif cdtype.id() == libcudf_types.TIMESTAMP_NANOSECONDS: + elif cdtype.id() == libcudf_types.type_id.TIMESTAMP_NANOSECONDS: return np.datetime64( ( s_ptr @@ -540,28 +540,28 @@ cdef _get_np_scalar_from_timedelta64(unique_ptr[scalar]& s): cdef libcudf_types.data_type cdtype = s_ptr[0].type() - if cdtype.id() == libcudf_types.DURATION_SECONDS: + if cdtype.id() == libcudf_types.type_id.DURATION_SECONDS: return np.timedelta64( ( s_ptr )[0].ticks(), duration_unit_map[(cdtype.id())] ) - elif cdtype.id() == libcudf_types.DURATION_MILLISECONDS: + elif cdtype.id() == libcudf_types.type_id.DURATION_MILLISECONDS: return np.timedelta64( ( s_ptr )[0].ticks(), duration_unit_map[(cdtype.id())] ) - elif cdtype.id() == libcudf_types.DURATION_MICROSECONDS: + elif cdtype.id() == libcudf_types.type_id.DURATION_MICROSECONDS: return np.timedelta64( ( s_ptr )[0].ticks(), duration_unit_map[(cdtype.id())] ) - elif cdtype.id() == libcudf_types.DURATION_NANOSECONDS: + elif cdtype.id() == libcudf_types.type_id.DURATION_NANOSECONDS: return np.timedelta64( ( s_ptr diff --git a/python/cudf/cudf/_lib/strings/convert/convert_fixed_point.pyx b/python/cudf/cudf/_lib/strings/convert/convert_fixed_point.pyx index 177cbffddb0..2085d5c2896 100644 --- a/python/cudf/cudf/_lib/strings/convert/convert_fixed_point.pyx +++ b/python/cudf/cudf/_lib/strings/convert/convert_fixed_point.pyx @@ -1,4 +1,4 @@ -# Copyright (c) 2021-2022, NVIDIA CORPORATION. +# Copyright (c) 2021-2023, NVIDIA CORPORATION. 
import cudf @@ -15,7 +15,7 @@ from cudf._lib.cpp.strings.convert.convert_fixed_point cimport ( is_fixed_point as cpp_is_fixed_point, to_fixed_point as cpp_to_fixed_point, ) -from cudf._lib.cpp.types cimport DECIMAL32, DECIMAL64, DECIMAL128, data_type +from cudf._lib.cpp.types cimport data_type, type_id @acquire_spill_lock() @@ -61,11 +61,11 @@ def to_decimal(Column input_col, object out_type): cdef int scale = out_type.scale cdef data_type c_out_type if isinstance(out_type, cudf.Decimal32Dtype): - c_out_type = data_type(DECIMAL32, -scale) + c_out_type = data_type(type_id.DECIMAL32, -scale) elif isinstance(out_type, cudf.Decimal64Dtype): - c_out_type = data_type(DECIMAL64, -scale) + c_out_type = data_type(type_id.DECIMAL64, -scale) elif isinstance(out_type, cudf.Decimal128Dtype): - c_out_type = data_type(DECIMAL128, -scale) + c_out_type = data_type(type_id.DECIMAL128, -scale) else: raise TypeError("should be a decimal dtype") with nogil: @@ -100,7 +100,7 @@ def is_fixed_point(Column input_col, object dtype): cdef unique_ptr[column] c_result cdef column_view source_view = input_col.view() cdef int scale = dtype.scale - cdef data_type c_dtype = data_type(DECIMAL64, -scale) + cdef data_type c_dtype = data_type(type_id.DECIMAL64, -scale) with nogil: c_result = move(cpp_is_fixed_point( source_view, diff --git a/python/cudf/cudf/_lib/types.pyx b/python/cudf/cudf/_lib/types.pyx index 8594e37ac4a..929f8b447ab 100644 --- a/python/cudf/cudf/_lib/types.pyx +++ b/python/cudf/cudf/_lib/types.pyx @@ -236,6 +236,7 @@ cdef dtype_from_column_view(column_view cv): ] cdef libcudf_types.data_type dtype_to_data_type(dtype) except *: + cdef libcudf_types.type_id tid if cudf.api.types.is_list_dtype(dtype): tid = libcudf_types.type_id.LIST elif cudf.api.types.is_struct_dtype(dtype): From a025db54a92ad967827ad6f6f2b251065fe09c73 Mon Sep 17 00:00:00 2001 From: Ed Seidl Date: Sat, 26 Aug 2023 01:25:16 -0700 Subject: [PATCH 108/230] Fix for encodings listed in the Parquet column chunk metadata (#13907) With the addition of V2 page headers, the encodings used have also changed. This PR correctly determines the encodings used in each column chunk and writes that information to the column chunk metadata. Authors: - Ed Seidl (https://github.com/etseidl) - Nghia Truong (https://github.com/ttnghia) Approvers: - Vukasin Milovanovic (https://github.com/vuule) - Nghia Truong (https://github.com/ttnghia) URL: https://github.com/rapidsai/cudf/pull/13907 --- cpp/src/io/parquet/page_enc.cu | 21 ++++++++ cpp/src/io/parquet/parquet_common.hpp | 1 + cpp/src/io/parquet/parquet_gpu.hpp | 13 ++++- cpp/src/io/parquet/writer_impl.cu | 29 ++++++----- cpp/tests/io/parquet_test.cpp | 70 +++++++++++++++++++++++++++ 5 files changed, 120 insertions(+), 14 deletions(-) diff --git a/cpp/src/io/parquet/page_enc.cu b/cpp/src/io/parquet/page_enc.cu index d066b454840..0af561be8da 100644 --- a/cpp/src/io/parquet/page_enc.cu +++ b/cpp/src/io/parquet/page_enc.cu @@ -229,6 +229,16 @@ Encoding __device__ determine_encoding(PageType page_type, } } +// operator to use with warp_reduce. 
stolen from cub::Sum +struct BitwiseOr { + /// Binary OR operator, returns a | b + template + __host__ __device__ __forceinline__ T operator()(T const& a, T const& b) const + { + return a | b; + } +}; + } // anonymous namespace // blockDim {512,1,1} @@ -1445,6 +1455,7 @@ __global__ void __launch_bounds__(decide_compression_block_size) uint32_t uncompressed_data_size = 0; uint32_t compressed_data_size = 0; + uint32_t encodings = 0; auto const num_pages = ck_g[warp_id].num_pages; for (auto page_id = lane_id; page_id < num_pages; page_id += cudf::detail::warp_size) { auto const& curr_page = ck_g[warp_id].pages[page_id]; @@ -1457,10 +1468,14 @@ __global__ void __launch_bounds__(decide_compression_block_size) atomicOr(&compression_error[warp_id], 1); } } + // collect encoding info for the chunk metadata + encodings |= encoding_to_mask(curr_page.encoding); } uncompressed_data_size = warp_reduce(temp_storage[warp_id][0]).Sum(uncompressed_data_size); compressed_data_size = warp_reduce(temp_storage[warp_id][1]).Sum(compressed_data_size); __syncwarp(); + encodings = warp_reduce(temp_storage[warp_id][0]).Reduce(encodings, BitwiseOr{}); + __syncwarp(); if (lane_id == 0) { auto const write_compressed = compressed_data_size != 0 and compression_error[warp_id] == 0 and @@ -1469,6 +1484,12 @@ __global__ void __launch_bounds__(decide_compression_block_size) chunks[chunk_id].bfr_size = uncompressed_data_size; chunks[chunk_id].compressed_size = write_compressed ? compressed_data_size : uncompressed_data_size; + + // if there is repetition or definition level data add RLE encoding + auto const rle_bits = + ck_g[warp_id].col_desc->num_def_level_bits() + ck_g[warp_id].col_desc->num_rep_level_bits(); + if (rle_bits > 0) { encodings |= encoding_to_mask(Encoding::RLE); } + chunks[chunk_id].encodings = encodings; } } diff --git a/cpp/src/io/parquet/parquet_common.hpp b/cpp/src/io/parquet/parquet_common.hpp index ab6290c4ed6..5f8f1617cb9 100644 --- a/cpp/src/io/parquet/parquet_common.hpp +++ b/cpp/src/io/parquet/parquet_common.hpp @@ -92,6 +92,7 @@ enum class Encoding : uint8_t { DELTA_BYTE_ARRAY = 7, RLE_DICTIONARY = 8, BYTE_STREAM_SPLIT = 9, + NUM_ENCODINGS = 10, }; /** diff --git a/cpp/src/io/parquet/parquet_gpu.hpp b/cpp/src/io/parquet/parquet_gpu.hpp index 0a8640aef26..e82b6abc13d 100644 --- a/cpp/src/io/parquet/parquet_gpu.hpp +++ b/cpp/src/io/parquet/parquet_gpu.hpp @@ -345,8 +345,8 @@ struct parquet_column_device_view : stats_column_desc { ConvertedType converted_type; //!< logical data type uint8_t level_bits; //!< bits to encode max definition (lower nibble) & repetition (upper nibble) //!< levels - constexpr uint8_t num_def_level_bits() { return level_bits & 0xf; } - constexpr uint8_t num_rep_level_bits() { return level_bits >> 4; } + constexpr uint8_t num_def_level_bits() const { return level_bits & 0xf; } + constexpr uint8_t num_rep_level_bits() const { return level_bits >> 4; } size_type const* const* nesting_offsets; //!< If column is a nested type, contains offset array of each nesting level @@ -384,6 +384,12 @@ constexpr size_t kDictScratchSize = (1 << kDictHashBits) * sizeof(uint32_t); struct EncPage; struct slot_type; +// convert Encoding to a mask value +constexpr uint32_t encoding_to_mask(Encoding encoding) +{ + return 1 << static_cast(encoding); +} + /** * @brief Struct describing an encoder column chunk */ @@ -420,6 +426,7 @@ struct EncColumnChunk { bool use_dictionary; //!< True if the chunk uses dictionary encoding uint8_t* column_index_blob; //!< Binary blob containing encoded column index 
for this chunk uint32_t column_index_size; //!< Size of column index blob + uint32_t encodings; //!< Mask representing the set of encodings used for this chunk }; /** @@ -748,6 +755,8 @@ void EncodePages(device_span pages, /** * @brief Launches kernel to make the compressed vs uncompressed chunk-level decision * + * Also calculates the set of page encodings used for each chunk. + * * @param[in,out] chunks Column chunks (updated with actual compressed/uncompressed sizes) * @param[in] stream CUDA stream to use */ diff --git a/cpp/src/io/parquet/writer_impl.cu b/cpp/src/io/parquet/writer_impl.cu index c5fc852d20b..d2976a3f5d9 100644 --- a/cpp/src/io/parquet/writer_impl.cu +++ b/cpp/src/io/parquet/writer_impl.cu @@ -193,6 +193,20 @@ parquet::Compression to_parquet_compression(compression_type compression) } } +/** + * @brief Convert a mask of encodings to a vector. + * + * @param encodings Vector of `Encoding`s to populate + * @param enc_mask Mask of encodings used + */ +void update_chunk_encodings(std::vector& encodings, uint32_t enc_mask) +{ + for (uint8_t enc = 0; enc < static_cast(Encoding::NUM_ENCODINGS); enc++) { + auto const enc_enum = static_cast(enc); + if ((enc_mask & gpu::encoding_to_mask(enc_enum)) != 0) { encodings.push_back(enc_enum); } + } +} + /** * @brief Compute size (in bytes) of the data stored in the given column. * @@ -1671,6 +1685,7 @@ auto convert_table_to_parquet_data(table_input_metadata& table_meta, ck.start_row = start_row; ck.num_rows = (uint32_t)row_group.num_rows; ck.first_fragment = c * num_fragments + f; + ck.encodings = 0; auto chunk_fragments = row_group_fragments[c].subspan(f, fragments_in_chunk); // In fragment struct, add a pointer to the chunk it belongs to // In each fragment in chunk_fragments, update the chunk pointer here. @@ -1687,7 +1702,6 @@ auto convert_table_to_parquet_data(table_input_metadata& table_meta, }); auto& column_chunk_meta = row_group.columns[c].meta_data; column_chunk_meta.type = parquet_columns[c].physical_type(); - column_chunk_meta.encodings = {Encoding::PLAIN, Encoding::RLE}; column_chunk_meta.path_in_schema = parquet_columns[c].get_path_in_schema(); column_chunk_meta.codec = UNCOMPRESSED; column_chunk_meta.num_values = ck.num_values; @@ -1703,17 +1717,6 @@ auto convert_table_to_parquet_data(table_input_metadata& table_meta, row_group_fragments.host_to_device_async(stream); [[maybe_unused]] auto dict_info_owner = build_chunk_dictionaries( chunks, col_desc, row_group_fragments, compression, dict_policy, max_dictionary_size, stream); - for (size_t p = 0; p < partitions.size(); p++) { - for (int rg = 0; rg < num_rg_in_part[p]; rg++) { - size_t global_rg = global_rowgroup_base[p] + rg; - for (int col = 0; col < num_columns; col++) { - if (chunks.host_view()[rg][col].use_dictionary) { - agg_meta->file(p).row_groups[global_rg].columns[col].meta_data.encodings.push_back( - Encoding::PLAIN_DICTIONARY); - } - } - } - } // The code preceding this used a uniform fragment size for all columns. Now recompute // fragments with a (potentially) varying number of fragments per column. 
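Editor's note: to make the bitmask round trip above concrete, here is a self-contained sketch of the idea behind `encoding_to_mask` and `update_chunk_encodings`: each page ORs one bit per `Encoding` into the chunk's mask, and the mask is later expanded into the list written to the column chunk metadata. The enum below is simplified to a few members (their numeric values follow the Parquet spec); the helper names are illustrative.

```cpp
// Simplified sketch of the per-chunk encoding bitmask used by this patch.
#include <cstdint>
#include <vector>

enum class Encoding : uint8_t {
  PLAIN            = 0,
  PLAIN_DICTIONARY = 2,
  RLE              = 3,
  RLE_DICTIONARY   = 8,
  NUM_ENCODINGS    = 10,
};

// One bit per encoding; pages OR their bit into the chunk's mask.
constexpr uint32_t encoding_to_mask(Encoding e) { return uint32_t{1} << static_cast<uint8_t>(e); }

// Expand the accumulated mask back into the list stored in the chunk metadata.
std::vector<Encoding> mask_to_encodings(uint32_t mask)
{
  std::vector<Encoding> encodings;
  for (uint8_t e = 0; e < static_cast<uint8_t>(Encoding::NUM_ENCODINGS); ++e) {
    if (mask & (uint32_t{1} << e)) { encodings.push_back(static_cast<Encoding>(e)); }
  }
  return encodings;
}
```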
@@ -1949,6 +1952,8 @@ auto convert_table_to_parquet_data(table_input_metadata& table_meta, } max_write_size = std::max(max_write_size, ck.compressed_size); + update_chunk_encodings(column_chunk_meta.encodings, ck.encodings); + if (ck.ck_stat_size != 0) { std::vector const stats_blob = cudf::detail::make_std_vector_sync( device_span(dev_bfr, ck.ck_stat_size), stream); diff --git a/cpp/tests/io/parquet_test.cpp b/cpp/tests/io/parquet_test.cpp index 8c7d598d33f..b210452f619 100644 --- a/cpp/tests/io/parquet_test.cpp +++ b/cpp/tests/io/parquet_test.cpp @@ -6599,4 +6599,74 @@ TEST_F(ParquetWriterTest, TimestampMicrosINT96NoOverflow) CUDF_TEST_EXPECT_TABLES_EQUAL(expected, result.tbl->view()); } +TEST_P(ParquetV2Test, CheckEncodings) +{ + using cudf::io::parquet::Encoding; + constexpr auto num_rows = 100'000; + auto const is_v2 = GetParam(); + + auto const validity = cudf::test::iterators::no_nulls(); + // data should be PLAIN for v1, RLE for V2 + auto col0_data = + cudf::detail::make_counting_transform_iterator(0, [](auto i) -> bool { return i % 2 == 0; }); + // data should be PLAIN for both + auto col1_data = random_values(num_rows); + // data should be PLAIN_DICTIONARY for v1, PLAIN and RLE_DICTIONARY for v2 + auto col2_data = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return 1; }); + + cudf::test::fixed_width_column_wrapper col0{col0_data, col0_data + num_rows, validity}; + column_wrapper col1{col1_data.begin(), col1_data.end(), validity}; + column_wrapper col2{col2_data, col2_data + num_rows, validity}; + + auto expected = table_view{{col0, col1, col2}}; + + auto const filename = is_v2 ? "CheckEncodingsV2.parquet" : "CheckEncodingsV1.parquet"; + auto filepath = temp_env->get_temp_filepath(filename); + cudf::io::parquet_writer_options out_opts = + cudf::io::parquet_writer_options::builder(cudf::io::sink_info{filepath}, expected) + .max_page_size_rows(num_rows) + .write_v2_headers(is_v2); + cudf::io::write_parquet(out_opts); + + // make sure the expected encodings are present + auto contains = [](auto const& vec, auto const& enc) { + return std::find(vec.begin(), vec.end(), enc) != vec.end(); + }; + + auto const source = cudf::io::datasource::create(filepath); + cudf::io::parquet::FileMetaData fmd; + + read_footer(source, &fmd); + auto const& chunk0_enc = fmd.row_groups[0].columns[0].meta_data.encodings; + auto const& chunk1_enc = fmd.row_groups[0].columns[1].meta_data.encodings; + auto const& chunk2_enc = fmd.row_groups[0].columns[2].meta_data.encodings; + if (is_v2) { + // col0 should have RLE for rep/def and data + EXPECT_TRUE(chunk0_enc.size() == 1); + EXPECT_TRUE(contains(chunk0_enc, Encoding::RLE)); + // col1 should have RLE for rep/def and PLAIN for data + EXPECT_TRUE(chunk1_enc.size() == 2); + EXPECT_TRUE(contains(chunk1_enc, Encoding::RLE)); + EXPECT_TRUE(contains(chunk1_enc, Encoding::PLAIN)); + // col2 should have RLE for rep/def, PLAIN for dict, and RLE_DICTIONARY for data + EXPECT_TRUE(chunk2_enc.size() == 3); + EXPECT_TRUE(contains(chunk2_enc, Encoding::RLE)); + EXPECT_TRUE(contains(chunk2_enc, Encoding::PLAIN)); + EXPECT_TRUE(contains(chunk2_enc, Encoding::RLE_DICTIONARY)); + } else { + // col0 should have RLE for rep/def and PLAIN for data + EXPECT_TRUE(chunk0_enc.size() == 2); + EXPECT_TRUE(contains(chunk0_enc, Encoding::RLE)); + EXPECT_TRUE(contains(chunk0_enc, Encoding::PLAIN)); + // col1 should have RLE for rep/def and PLAIN for data + EXPECT_TRUE(chunk1_enc.size() == 2); + EXPECT_TRUE(contains(chunk1_enc, Encoding::RLE)); + 
EXPECT_TRUE(contains(chunk1_enc, Encoding::PLAIN)); + // col2 should have RLE for rep/def and PLAIN_DICTIONARY for data and dict + EXPECT_TRUE(chunk2_enc.size() == 2); + EXPECT_TRUE(contains(chunk2_enc, Encoding::RLE)); + EXPECT_TRUE(contains(chunk2_enc, Encoding::PLAIN_DICTIONARY)); + } +} + CUDF_TEST_PROGRAM_MAIN() From 2c7f02c399e58538a7f772e86839c05d3e80ca19 Mon Sep 17 00:00:00 2001 From: Divye Gala Date: Sun, 27 Aug 2023 13:42:36 -0400 Subject: [PATCH 109/230] Use `thread_index_type` in `partitioning.cu` (#13973) This PR uses `cudf::thread_index_type` to avoid overflows. Authors: - Divye Gala (https://github.com/divyegala) Approvers: - Bradley Dice (https://github.com/bdice) - Karthikeyan (https://github.com/karthikeyann) URL: https://github.com/rapidsai/cudf/pull/13973 --- cpp/src/partitioning/partitioning.cu | 23 +++++++++++++++-------- 1 file changed, 15 insertions(+), 8 deletions(-) diff --git a/cpp/src/partitioning/partitioning.cu b/cpp/src/partitioning/partitioning.cu index 0d94db110b4..ff9c4ea2f59 100644 --- a/cpp/src/partitioning/partitioning.cu +++ b/cpp/src/partitioning/partitioning.cu @@ -134,7 +134,8 @@ __global__ void compute_row_partition_numbers(row_hasher_t the_hasher, // Accumulate histogram of the size of each partition in shared memory extern __shared__ size_type shared_partition_sizes[]; - size_type row_number = threadIdx.x + blockIdx.x * blockDim.x; + auto tid = cudf::thread_index_type{threadIdx.x} + + cudf::thread_index_type{blockIdx.x} * cudf::thread_index_type{blockDim.x}; // Initialize local histogram size_type partition_number = threadIdx.x; @@ -148,7 +149,8 @@ __global__ void compute_row_partition_numbers(row_hasher_t the_hasher, // Compute the hash value for each row, store it to the array of hash values // and compute the partition to which the hash value belongs and increment // the shared memory counter for that partition - while (row_number < num_rows) { + while (tid < num_rows) { + auto const row_number = static_cast(tid); hash_value_type const row_hash_value = the_hasher(row_number); size_type const partition_number = the_partitioner(row_hash_value); @@ -158,7 +160,7 @@ __global__ void compute_row_partition_numbers(row_hasher_t the_hasher, row_partition_offset[row_number] = atomicAdd(&(shared_partition_sizes[partition_number]), size_type(1)); - row_number += blockDim.x * gridDim.x; + tid += cudf::thread_index_type{blockDim.x} * cudf::thread_index_type{gridDim.x}; } __syncthreads(); @@ -213,12 +215,14 @@ __global__ void compute_row_output_locations(size_type* __restrict__ row_partiti } __syncthreads(); - size_type row_number = threadIdx.x + blockIdx.x * blockDim.x; + auto tid = cudf::thread_index_type{threadIdx.x} + + cudf::thread_index_type{blockIdx.x} * cudf::thread_index_type{blockDim.x}; // Get each row's partition number, and get it's output location by // incrementing block's offset counter for that partition number // and store the row's output location in-place - while (row_number < num_rows) { + while (tid < num_rows) { + auto const row_number = static_cast(tid); // Get partition number of this row size_type const partition_number = row_partition_numbers[row_number]; @@ -230,7 +234,7 @@ __global__ void compute_row_output_locations(size_type* __restrict__ row_partiti // Store the row's output location in-place row_partition_numbers[row_number] = row_output_location; - row_number += blockDim.x * gridDim.x; + tid += cudf::thread_index_type{blockDim.x} * cudf::thread_index_type{gridDim.x}; } } @@ -307,8 +311,11 @@ __global__ void 
copy_block_partitions(InputIter input_iter, __syncthreads(); // Fetch the input data to shared memory - for (size_type row_number = threadIdx.x + blockIdx.x * blockDim.x; row_number < num_rows; - row_number += blockDim.x * gridDim.x) { + for (auto tid = cudf::thread_index_type{threadIdx.x} + + cudf::thread_index_type{blockIdx.x} * cudf::thread_index_type{blockDim.x}; + tid < num_rows; + tid += cudf::thread_index_type{blockDim.x} * cudf::thread_index_type{gridDim.x}) { + auto const row_number = static_cast(tid); size_type const ipartition = row_partition_numbers[row_number]; block_output[partition_offset_shared[ipartition] + row_partition_offset[row_number]] = From aba001c12f8db876ab7b763fcde939dba9efd665 Mon Sep 17 00:00:00 2001 From: Karthikeyan <6488848+karthikeyann@users.noreply.github.com> Date: Mon, 28 Aug 2023 15:23:34 +0530 Subject: [PATCH 110/230] Use cuco::static_set in JSON tree algorithm (#13928) In JSON tree algorithms of JSON reader, cuco static_map is used as a set. This PR replaces it with static_set. No tests are changed. No significant runtime changes. Addresses part of #12261 Authors: - Karthikeyan (https://github.com/karthikeyann) Approvers: - Nghia Truong (https://github.com/ttnghia) - Yunsong Wang (https://github.com/PointKernel) URL: https://github.com/rapidsai/cudf/pull/13928 --- cpp/src/io/json/json_tree.cu | 85 ++++++++++++++++-------------------- 1 file changed, 38 insertions(+), 47 deletions(-) diff --git a/cpp/src/io/json/json_tree.cu b/cpp/src/io/json/json_tree.cu index 3f1f2e81d21..9231040eb70 100644 --- a/cpp/src/io/json/json_tree.cu +++ b/cpp/src/io/json/json_tree.cu @@ -35,7 +35,7 @@ #include -#include +#include #include #include @@ -400,8 +400,6 @@ rmm::device_uvector hash_node_type_with_field_name(device_span>; - using hash_map_type = - cuco::static_map; auto const num_nodes = d_tree.node_categories.size(); auto const num_fields = thrust::count(rmm::exec_policy(stream), @@ -409,12 +407,6 @@ rmm::device_uvector hash_node_type_with_field_name(device_span{}, stream}, - stream.value()}; auto const d_hasher = [d_input = d_input.data(), node_range_begin = d_tree.node_range_begin.data(), node_range_end = d_tree.node_range_end.data()] __device__(auto node_id) { @@ -434,25 +426,33 @@ rmm::device_uvector hash_node_type_with_field_name(device_span(0); auto const is_field_name_node = [node_categories = d_tree.node_categories.data()] __device__(auto node_id) { return node_categories[node_id] == node_t::NC_FN; }; - key_map.insert_if(iter, - iter + num_nodes, - thrust::counting_iterator(0), // stencil - is_field_name_node, - d_hasher, - d_equal, - stream.value()); + + using hasher_type = decltype(d_hasher); + constexpr size_type empty_node_index_sentinel = -1; + auto key_set = + cuco::experimental::static_set{cuco::experimental::extent{compute_hash_table_size( + num_fields, 40)}, // 40% occupancy in hash map + cuco::empty_key{empty_node_index_sentinel}, + d_equal, + cuco::experimental::linear_probing<1, hasher_type>{d_hasher}, + hash_table_allocator_type{default_allocator{}, stream}, + stream.value()}; + key_set.insert_if_async(iter, + iter + num_nodes, + thrust::counting_iterator(0), // stencil + is_field_name_node, + stream.value()); auto const get_hash_value = - [key_map = key_map.get_device_view(), d_hasher, d_equal] __device__(auto node_id) -> size_type { - auto const it = key_map.find(node_id, d_hasher, d_equal); - return (it == key_map.end()) ? 
size_type{0} : it->second.load(cuda::std::memory_order_relaxed); + [key_set = key_set.ref(cuco::experimental::op::find)] __device__(auto node_id) -> size_type { + auto const it = key_set.find(node_id); + return (it == key_set.end()) ? size_type{0} : *it; }; // convert field nodes to node indices, and other nodes to enum value. @@ -528,7 +528,6 @@ std::pair, rmm::device_uvector> hash_n { CUDF_FUNC_RANGE(); auto const num_nodes = parent_node_ids.size(); - rmm::device_uvector col_id(num_nodes, stream, mr); // array of arrays NodeIndexT const row_array_children_level = is_enabled_lines ? 1 : 2; @@ -560,17 +559,6 @@ std::pair, rmm::device_uvector> hash_n list_indices.begin()); } - using hash_table_allocator_type = rmm::mr::stream_allocator_adaptor>; - using hash_map_type = - cuco::static_map; - - constexpr size_type empty_node_index_sentinel = -1; - hash_map_type key_map{compute_hash_table_size(num_nodes), // TODO reduce oversubscription - cuco::empty_key{empty_node_index_sentinel}, - cuco::empty_value{empty_node_index_sentinel}, - cuco::erased_key{-2}, - hash_table_allocator_type{default_allocator{}, stream}, - stream.value()}; // path compression is not used since extra writes make all map operations slow. auto const d_hasher = [node_level = node_levels.begin(), node_type = node_type.begin(), @@ -632,23 +620,26 @@ std::pair, rmm::device_uvector> hash_n return node_id1 == node_id2; }; + constexpr size_type empty_node_index_sentinel = -1; + using hash_table_allocator_type = rmm::mr::stream_allocator_adaptor>; + using hasher_type = decltype(d_hashed_cache); + + auto key_set = cuco::experimental::static_set{ + cuco::experimental::extent{compute_hash_table_size(num_nodes)}, + cuco::empty_key{empty_node_index_sentinel}, + d_equal, + cuco::experimental::linear_probing<1, hasher_type>{d_hashed_cache}, + hash_table_allocator_type{default_allocator{}, stream}, + stream.value()}; + // insert and convert node ids to unique set ids - auto const num_inserted = thrust::count_if( - rmm::exec_policy(stream), - thrust::make_counting_iterator(0), - thrust::make_counting_iterator(num_nodes), - [d_hashed_cache, - d_equal, - view = key_map.get_device_mutable_view(), - uq_node_id = col_id.begin()] __device__(auto node_id) mutable { - auto it = view.insert_and_find(cuco::make_pair(node_id, node_id), d_hashed_cache, d_equal); - uq_node_id[node_id] = (it.first)->first.load(cuda::std::memory_order_relaxed); - return it.second; - }); + auto nodes_itr = thrust::make_counting_iterator(0); + auto const num_columns = key_set.insert(nodes_itr, nodes_itr + num_nodes, stream.value()); - auto const num_columns = num_inserted; // key_map.get_size() is not updated. rmm::device_uvector unique_keys(num_columns, stream); - key_map.retrieve_all(unique_keys.begin(), thrust::make_discard_iterator(), stream.value()); + rmm::device_uvector col_id(num_nodes, stream, mr); + key_set.find_async(nodes_itr, nodes_itr + num_nodes, col_id.begin(), stream.value()); + std::ignore = key_set.retrieve_all(unique_keys.begin(), stream.value()); return {std::move(col_id), std::move(unique_keys)}; } From d138dd0c9c365e03891d33cf4423a553629a3f6b Mon Sep 17 00:00:00 2001 From: GALI PREM SAGAR Date: Mon, 28 Aug 2023 08:27:34 -0500 Subject: [PATCH 111/230] Restore column type metadata with `dropna` to fix `factorize` API (#13980) closes #13979 This PR restores column type metadata for `dropna` call, absense of this restoration was causing an issue with the `CategoricalColumn.dropna` that was necessary for `factorize` API. 
Authors: - GALI PREM SAGAR (https://github.com/galipremsagar) Approvers: - Bradley Dice (https://github.com/bdice) URL: https://github.com/rapidsai/cudf/pull/13980 --- python/cudf/cudf/core/column/column.py | 2 +- python/cudf/cudf/tests/test_factorize.py | 20 +++++++++++++++++++- 2 files changed, 20 insertions(+), 2 deletions(-) diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index eafcc18450d..b5332f35073 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -291,7 +291,7 @@ def any(self, skipna: bool = True) -> bool: def dropna(self, drop_nan: bool = False) -> ColumnBase: # The drop_nan argument is only used for numerical columns. - return drop_nulls([self])[0] + return drop_nulls([self])[0]._with_type_metadata(self.dtype) def to_arrow(self) -> pa.Array: """Convert to PyArrow Array diff --git a/python/cudf/cudf/tests/test_factorize.py b/python/cudf/cudf/tests/test_factorize.py index 90cf11d7dde..730bfdd8590 100644 --- a/python/cudf/cudf/tests/test_factorize.py +++ b/python/cudf/cudf/tests/test_factorize.py @@ -1,4 +1,4 @@ -# Copyright (c) 2018-2022, NVIDIA CORPORATION. +# Copyright (c) 2018-2023, NVIDIA CORPORATION. import cupy as cp import numpy as np @@ -139,3 +139,21 @@ def test_factorize_result_classes(): assert isinstance(labels, cp.ndarray) assert isinstance(cats, cp.ndarray) + + +@pytest.mark.parametrize( + "data", + [ + ["abc", "def", "abc", "a", "def", None], + [10, 20, 100, -10, 0, 1, None, 10, 100], + ], +) +def test_category_dtype_factorize(data): + gs = cudf.Series(data, dtype="category") + ps = gs.to_pandas() + + actual_codes, actual_uniques = gs.factorize() + expected_codes, expected_uniques = ps.factorize() + + assert_eq(actual_codes, expected_codes) + assert_eq(actual_uniques, expected_uniques) From 8a78d68b8bdb6312d97e01b593814f27115a4727 Mon Sep 17 00:00:00 2001 From: GALI PREM SAGAR Date: Mon, 28 Aug 2023 10:23:05 -0500 Subject: [PATCH 112/230] Fix `CategoricalIndex` ordering in `Groupby.agg` when pandas-compatibility mode is enabled (#13978) closes #13974 This PR re-calculates the `CategoricalIndex`'s `categories` order to match the order in which the grouping has been done for the `CategoricalColumn`. This fix is being done only when pandas-compatibility mode is enabled. Authors: - GALI PREM SAGAR (https://github.com/galipremsagar) Approvers: - Matthew Roeschke (https://github.com/mroeschke) URL: https://github.com/rapidsai/cudf/pull/13978 --- python/cudf/cudf/core/groupby/groupby.py | 9 +++++++++ python/cudf/cudf/tests/test_groupby.py | 16 ++++++++++++++++ 2 files changed, 25 insertions(+) diff --git a/python/cudf/cudf/core/groupby/groupby.py b/python/cudf/cudf/core/groupby/groupby.py index cf4c861c28f..38b07eca330 100644 --- a/python/cudf/cudf/core/groupby/groupby.py +++ b/python/cudf/cudf/core/groupby/groupby.py @@ -613,6 +613,15 @@ def agg(self, func): how="left", ) result = result.take(indices) + if isinstance(result._index, cudf.CategoricalIndex): + # Needs re-ordering the categories in the order + # they are after grouping. 
+ result._index = cudf.Index( + result._index._column.reorder_categories( + result._index._column._get_decategorized_column() + ), + name=result._index.name, + ) if not self._as_index: result = result.reset_index() diff --git a/python/cudf/cudf/tests/test_groupby.py b/python/cudf/cudf/tests/test_groupby.py index b48ce210104..2ab8b29f224 100644 --- a/python/cudf/cudf/tests/test_groupby.py +++ b/python/cudf/cudf/tests/test_groupby.py @@ -3440,3 +3440,19 @@ def test_groupby_consecutive_operations(): expected = pg.cumsum() assert_groupby_results_equal(actual, expected, check_dtype=False) + + +def test_categorical_grouping_pandas_compatibility(): + gdf = cudf.DataFrame( + { + "key": cudf.Series([2, 1, 3, 1, 1], dtype="category"), + "a": [0, 1, 3, 2, 3], + } + ) + pdf = gdf.to_pandas() + + with cudf.option_context("mode.pandas_compatible", True): + actual = gdf.groupby("key", sort=False).sum() + expected = pdf.groupby("key", sort=False).sum() + + assert_eq(actual, expected) From f9e35c7216ed433564d845298bb2e15f0c960461 Mon Sep 17 00:00:00 2001 From: GALI PREM SAGAR Date: Mon, 28 Aug 2023 10:25:42 -0500 Subject: [PATCH 113/230] Enable `codes` dtype parity in pandas-compatibility mode for `factorize` API (#13982) closes #13981 This PR enables parity with pandas `factorize` API by returning `codes` with `int64` dtype only in pandas-compatibility mode. When the pandas-compatibility mode is turned off, `cudf` will calculate the appropriate dtype that needs to be returned to save memory usage. Authors: - GALI PREM SAGAR (https://github.com/galipremsagar) Approvers: - Matthew Roeschke (https://github.com/mroeschke) URL: https://github.com/rapidsai/cudf/pull/13982 --- python/cudf/cudf/core/algorithms.py | 5 ++++- python/cudf/cudf/tests/test_factorize.py | 17 +++++++++++++++++ 2 files changed, 21 insertions(+), 1 deletion(-) diff --git a/python/cudf/cudf/core/algorithms.py b/python/cudf/cudf/core/algorithms.py index ff604d3252b..a472142ece0 100644 --- a/python/cudf/cudf/core/algorithms.py +++ b/python/cudf/cudf/core/algorithms.py @@ -9,6 +9,7 @@ from cudf.core.indexed_frame import IndexedFrame from cudf.core.scalar import Scalar from cudf.core.series import Series +from cudf.options import get_option def factorize( @@ -137,7 +138,9 @@ def factorize( cats = cats.sort_values() labels = values._column._label_encoding( - cats=cats, na_sentinel=Scalar(na_sentinel) + cats=cats, + na_sentinel=Scalar(na_sentinel), + dtype="int64" if get_option("mode.pandas_compatible") else None, ).values return labels, cats.values if return_cupy_array else Index(cats) diff --git a/python/cudf/cudf/tests/test_factorize.py b/python/cudf/cudf/tests/test_factorize.py index 730bfdd8590..bf409b30090 100644 --- a/python/cudf/cudf/tests/test_factorize.py +++ b/python/cudf/cudf/tests/test_factorize.py @@ -122,6 +122,23 @@ def test_cudf_factorize_array(): np.testing.assert_array_equal(expect[1], got[1].get()) +@pytest.mark.parametrize("pandas_compatibility", [True, False]) +def test_factorize_code_pandas_compatibility(pandas_compatibility): + + psr = pd.Series([1, 2, 3, 4, 5]) + gsr = cudf.from_pandas(psr) + + expect = pd.factorize(psr) + with cudf.option_context("mode.pandas_compatible", pandas_compatibility): + got = cudf.factorize(gsr) + assert_eq(got[0], expect[0]) + assert_eq(got[1], expect[1]) + if pandas_compatibility: + assert got[0].dtype == expect[0].dtype + else: + assert got[0].dtype == cudf.dtype("int8") + + def test_factorize_result_classes(): data = [1, 2, 3] From 724e42ae685e2063865378a5ae904f5cd6d8b3e3 Mon Sep 17 
00:00:00 2001 From: brandon-b-miller <53796099+brandon-b-miller@users.noreply.github.com> Date: Mon, 28 Aug 2023 10:53:36 -0500 Subject: [PATCH 114/230] Fix integer overflow in shim `device_sum` functions (#13943) Closes https://github.com/rapidsai/cudf/issues/13873 Authors: - https://github.com/brandon-b-miller Approvers: - Lawrence Mitchell (https://github.com/wence-) - Bradley Dice (https://github.com/bdice) URL: https://github.com/rapidsai/cudf/pull/13943 --- python/cudf/cudf/tests/test_groupby.py | 17 +++++++++ python/cudf/udf_cpp/shim.cu | 50 +++++++++++--------------- 2 files changed, 37 insertions(+), 30 deletions(-) diff --git a/python/cudf/cudf/tests/test_groupby.py b/python/cudf/cudf/tests/test_groupby.py index 2ab8b29f224..042f0e1aa38 100644 --- a/python/cudf/cudf/tests/test_groupby.py +++ b/python/cudf/cudf/tests/test_groupby.py @@ -474,6 +474,23 @@ def func(group): run_groupby_apply_jit_test(data, func, ["a"]) +@pytest.mark.parametrize("dtype", ["int32"]) +def test_groupby_apply_jit_sum_integer_overflow(dtype): + max = np.iinfo(dtype).max + + data = DataFrame( + { + "a": [0, 0, 0], + "b": [max, max, max], + } + ) + + def func(group): + return group["b"].sum() + + run_groupby_apply_jit_test(data, func, ["a"]) + + @pytest.mark.parametrize("dtype", ["float64"]) @pytest.mark.parametrize("func", ["min", "max", "sum", "mean", "var", "std"]) @pytest.mark.parametrize("special_val", [np.nan, np.inf, -np.inf]) diff --git a/python/cudf/udf_cpp/shim.cu b/python/cudf/udf_cpp/shim.cu index 686e39e7036..cabca3154be 100644 --- a/python/cudf/udf_cpp/shim.cu +++ b/python/cudf/udf_cpp/shim.cu @@ -388,26 +388,30 @@ __device__ bool are_all_nans(cooperative_groups::thread_block const& block, return count == 0; } -template -__device__ void device_sum(cooperative_groups::thread_block const& block, - T const* data, - int64_t size, - T* sum) +template , int64_t, T>> +__device__ AccumT device_sum(cooperative_groups::thread_block const& block, + T const* data, + int64_t size) { - T local_sum = 0; + __shared__ AccumT block_sum; + if (block.thread_rank() == 0) { block_sum = 0; } + block.sync(); + + AccumT local_sum = 0; for (int64_t idx = block.thread_rank(); idx < size; idx += block.size()) { - local_sum += data[idx]; + local_sum += static_cast(data[idx]); } - cuda::atomic_ref ref{*sum}; + cuda::atomic_ref ref{block_sum}; ref.fetch_add(local_sum, cuda::std::memory_order_relaxed); block.sync(); + return block_sum; } -template -__device__ T BlockSum(T const* data, int64_t size) +template , int64_t, T>> +__device__ AccumT BlockSum(T const* data, int64_t size) { auto block = cooperative_groups::this_thread_block(); @@ -415,11 +419,7 @@ __device__ T BlockSum(T const* data, int64_t size) if (are_all_nans(block, data, size)) { return 0; } } - __shared__ T block_sum; - if (block.thread_rank() == 0) { block_sum = 0; } - block.sync(); - - device_sum(block, data, size, &block_sum); + auto block_sum = device_sum(block, data, size); return block_sum; } @@ -428,11 +428,7 @@ __device__ double BlockMean(T const* data, int64_t size) { auto block = cooperative_groups::this_thread_block(); - __shared__ T block_sum; - if (block.thread_rank() == 0) { block_sum = 0; } - block.sync(); - - device_sum(block, data, size, &block_sum); + auto block_sum = device_sum(block, data, size); return static_cast(block_sum) / static_cast(size); } @@ -443,17 +439,11 @@ __device__ double BlockCoVar(T const* lhs, T const* rhs, int64_t size) __shared__ double block_covar; - __shared__ T block_sum_lhs; - __shared__ T block_sum_rhs; - - if 
(block.thread_rank() == 0) { - block_covar = 0; - block_sum_lhs = 0; - block_sum_rhs = 0; - } + if (block.thread_rank() == 0) { block_covar = 0; } block.sync(); - device_sum(block, lhs, size, &block_sum_lhs); + auto block_sum_lhs = device_sum(block, lhs, size); + auto const mu_l = static_cast(block_sum_lhs) / static_cast(size); auto const mu_r = [=]() { if (lhs == rhs) { @@ -461,7 +451,7 @@ __device__ double BlockCoVar(T const* lhs, T const* rhs, int64_t size) // Thus we can assume mu_r = mu_l. return mu_l; } else { - device_sum(block, rhs, size, &block_sum_rhs); + auto block_sum_rhs = device_sum(block, rhs, size); return static_cast(block_sum_rhs) / static_cast(size); } }(); From 3c8ce98e00e5a2b686cda690620f2a519d2a8e3d Mon Sep 17 00:00:00 2001 From: David Wendt <45795991+davidwendt@users.noreply.github.com> Date: Mon, 28 Aug 2023 12:53:18 -0400 Subject: [PATCH 115/230] Use cudf::thread_index_type in strings custom kernels (#13968) Adds `cudf::thread_index_type` usage when calculating the thread index in custom kernels in `src/strings/attributes.cu` and `src/strings/convert/convert_urls.cu` Authors: - David Wendt (https://github.com/davidwendt) Approvers: - Vukasin Milovanovic (https://github.com/vuule) - Nghia Truong (https://github.com/ttnghia) URL: https://github.com/rapidsai/cudf/pull/13968 --- cpp/src/strings/attributes.cu | 10 ++-- cpp/src/strings/convert/convert_urls.cu | 68 ++++++++++++------------- 2 files changed, 39 insertions(+), 39 deletions(-) diff --git a/cpp/src/strings/attributes.cu b/cpp/src/strings/attributes.cu index 3a1b7044b56..8dc150998ee 100644 --- a/cpp/src/strings/attributes.cu +++ b/cpp/src/strings/attributes.cu @@ -111,14 +111,14 @@ std::unique_ptr counts_fn(strings_column_view const& strings, __global__ void count_characters_parallel_fn(column_device_view const d_strings, size_type* d_lengths) { - size_type const idx = static_cast(threadIdx.x + blockIdx.x * blockDim.x); - using warp_reduce = cub::WarpReduce; + auto const idx = cudf::detail::grid_1d::global_thread_id(); + using warp_reduce = cub::WarpReduce; __shared__ typename warp_reduce::TempStorage temp_storage; if (idx >= (d_strings.size() * cudf::detail::warp_size)) { return; } - auto const str_idx = idx / cudf::detail::warp_size; - auto const lane_idx = idx % cudf::detail::warp_size; + auto const str_idx = static_cast(idx / cudf::detail::warp_size); + auto const lane_idx = static_cast(idx % cudf::detail::warp_size); if (d_strings.is_null(str_idx)) { d_lengths[str_idx] = 0; return; @@ -126,7 +126,7 @@ __global__ void count_characters_parallel_fn(column_device_view const d_strings, auto const d_str = d_strings.element(str_idx); auto const str_ptr = d_str.data(); - auto count = 0; + size_type count = 0; for (auto i = lane_idx; i < d_str.size_bytes(); i += cudf::detail::warp_size) { count += static_cast(is_begin_utf8_char(str_ptr[i])); } diff --git a/cpp/src/strings/convert/convert_urls.cu b/cpp/src/strings/convert/convert_urls.cu index 401a04cdc9d..71b6c09310e 100644 --- a/cpp/src/strings/convert/convert_urls.cu +++ b/cpp/src/strings/convert/convert_urls.cu @@ -195,7 +195,7 @@ __forceinline__ __device__ char escaped_sequence_to_byte(char const* const ptr) * @param[in] in_strings Input string column. * @param[out] out_counts Number of characters in each decode URL. 
*/ -template +template __global__ void url_decode_char_counter(column_device_view const in_strings, size_type* const out_counts) { @@ -203,12 +203,12 @@ __global__ void url_decode_char_counter(column_device_view const in_strings, __shared__ char temporary_buffer[num_warps_per_threadblock][char_block_size + halo_size]; __shared__ typename cub::WarpReduce::TempStorage cub_storage[num_warps_per_threadblock]; - int const global_thread_id = blockIdx.x * blockDim.x + threadIdx.x; - int const global_warp_id = global_thread_id / cudf::detail::warp_size; - int const local_warp_id = threadIdx.x / cudf::detail::warp_size; - int const warp_lane = threadIdx.x % cudf::detail::warp_size; - int const nwarps = gridDim.x * blockDim.x / cudf::detail::warp_size; - char* in_chars_shared = temporary_buffer[local_warp_id]; + auto const global_thread_id = cudf::detail::grid_1d::global_thread_id(); + auto const global_warp_id = static_cast(global_thread_id / cudf::detail::warp_size); + auto const local_warp_id = static_cast(threadIdx.x / cudf::detail::warp_size); + auto const warp_lane = static_cast(threadIdx.x % cudf::detail::warp_size); + auto const nwarps = static_cast(gridDim.x * blockDim.x / cudf::detail::warp_size); + char* in_chars_shared = temporary_buffer[local_warp_id]; // Loop through strings, and assign each string to a warp. for (size_type row_idx = global_warp_id; row_idx < in_strings.size(); row_idx += nwarps) { @@ -220,11 +220,11 @@ __global__ void url_decode_char_counter(column_device_view const in_strings, auto const in_string = in_strings.element(row_idx); auto const in_chars = in_string.data(); auto const string_length = in_string.size_bytes(); - int const nblocks = cudf::util::div_rounding_up_unsafe(string_length, char_block_size); + auto const nblocks = cudf::util::div_rounding_up_unsafe(string_length, char_block_size); size_type escape_char_count = 0; - for (int block_idx = 0; block_idx < nblocks; block_idx++) { - int const string_length_block = + for (size_type block_idx = 0; block_idx < nblocks; block_idx++) { + auto const string_length_block = std::min(char_block_size, string_length - char_block_size * block_idx); // Each warp collectively loads input characters of the current block to the shared memory. @@ -233,18 +233,18 @@ __global__ void url_decode_char_counter(column_device_view const in_strings, // are added after the end of the block. If the cell is beyond the end of the string, 0s are // filled in to make sure the last two characters of the string are not the start of an // escaped sequence. - for (int char_idx = warp_lane; char_idx < string_length_block + halo_size; + for (auto char_idx = warp_lane; char_idx < string_length_block + halo_size; char_idx += cudf::detail::warp_size) { - int const in_idx = block_idx * char_block_size + char_idx; + auto const in_idx = block_idx * char_block_size + char_idx; in_chars_shared[char_idx] = in_idx < string_length ? in_chars[in_idx] : 0; } __syncwarp(); // `char_idx_start` represents the start character index of the current warp. - for (int char_idx_start = 0; char_idx_start < string_length_block; + for (size_type char_idx_start = 0; char_idx_start < string_length_block; char_idx_start += cudf::detail::warp_size) { - int const char_idx = char_idx_start + warp_lane; + auto const char_idx = char_idx_start + warp_lane; int8_t const is_ichar_escape_char = (char_idx < string_length_block && is_escape_char(in_chars_shared + char_idx)) ? 
1 : 0; @@ -277,7 +277,7 @@ __global__ void url_decode_char_counter(column_device_view const in_strings, * @param[out] out_chars Character buffer for the output string column. * @param[in] out_offsets Offset value of each string associated with `out_chars`. */ -template +template __global__ void url_decode_char_replacer(column_device_view const in_strings, char* const out_chars, size_type const* const out_offsets) @@ -285,14 +285,14 @@ __global__ void url_decode_char_replacer(column_device_view const in_strings, constexpr int halo_size = 2; __shared__ char temporary_buffer[num_warps_per_threadblock][char_block_size + halo_size * 2]; __shared__ typename cub::WarpScan::TempStorage cub_storage[num_warps_per_threadblock]; - __shared__ int out_idx[num_warps_per_threadblock]; + __shared__ size_type out_idx[num_warps_per_threadblock]; - int const global_thread_id = blockIdx.x * blockDim.x + threadIdx.x; - int const global_warp_id = global_thread_id / cudf::detail::warp_size; - int const local_warp_id = threadIdx.x / cudf::detail::warp_size; - int const warp_lane = threadIdx.x % cudf::detail::warp_size; - int const nwarps = gridDim.x * blockDim.x / cudf::detail::warp_size; - char* in_chars_shared = temporary_buffer[local_warp_id]; + auto const global_thread_id = cudf::detail::grid_1d::global_thread_id(); + auto const global_warp_id = static_cast(global_thread_id / cudf::detail::warp_size); + auto const local_warp_id = static_cast(threadIdx.x / cudf::detail::warp_size); + auto const warp_lane = static_cast(threadIdx.x % cudf::detail::warp_size); + auto const nwarps = static_cast(gridDim.x * blockDim.x / cudf::detail::warp_size); + char* in_chars_shared = temporary_buffer[local_warp_id]; // Loop through strings, and assign each string to a warp for (size_type row_idx = global_warp_id; row_idx < in_strings.size(); row_idx += nwarps) { @@ -302,31 +302,31 @@ __global__ void url_decode_char_replacer(column_device_view const in_strings, auto const in_chars = in_string.data(); auto const string_length = in_string.size_bytes(); auto out_chars_string = out_chars + out_offsets[row_idx]; - int const nblocks = cudf::util::div_rounding_up_unsafe(string_length, char_block_size); + auto const nblocks = cudf::util::div_rounding_up_unsafe(string_length, char_block_size); // Use the last thread of the warp to initialize `out_idx` to 0. if (warp_lane == cudf::detail::warp_size - 1) { out_idx[local_warp_id] = 0; } - for (int block_idx = 0; block_idx < nblocks; block_idx++) { - int const string_length_block = + for (size_type block_idx = 0; block_idx < nblocks; block_idx++) { + auto const string_length_block = std::min(char_block_size, string_length - char_block_size * block_idx); // Each warp collectively loads input characters of the current block to shared memory. // Two halo cells before and after the block are added. The halo cells are used to test // whether the current location as well as the previous two locations are escape characters, // without branches. - for (int char_idx = warp_lane; char_idx < string_length_block + halo_size * 2; + for (auto char_idx = warp_lane; char_idx < string_length_block + halo_size * 2; char_idx += cudf::detail::warp_size) { - int const in_idx = block_idx * char_block_size + char_idx - halo_size; + auto const in_idx = block_idx * char_block_size + char_idx - halo_size; in_chars_shared[char_idx] = in_idx >= 0 && in_idx < string_length ? in_chars[in_idx] : 0; } __syncwarp(); // `char_idx_start` represents the start character index of the current warp. 
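Editor's note: the index arithmetic being changed in these string kernels (and in the earlier partitioning.cu patch) is about widening the thread index before it is multiplied, so `blockIdx.x * blockDim.x + threadIdx.x` cannot overflow a 32-bit `int` on very large grids. A minimal sketch of the pattern follows; the kernel name and body are illustrative, and `thread_index_type` here stands in for `cudf::thread_index_type`.

```cpp
// Sketch of a grid-stride loop that performs all index math in 64 bits.
#include <cstdint>

using thread_index_type = int64_t;  // stand-in for cudf::thread_index_type

__global__ void example_kernel(int const* data, int64_t num_rows, int* out)
{
  // Widen each operand before multiplying/adding to avoid 32-bit overflow.
  auto const tid    = thread_index_type{threadIdx.x} +
                      thread_index_type{blockIdx.x} * thread_index_type{blockDim.x};
  auto const stride = thread_index_type{blockDim.x} * thread_index_type{gridDim.x};

  for (auto idx = tid; idx < num_rows; idx += stride) {
    out[idx] = data[idx] + 1;  // any per-row work
  }
}
```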
- for (int char_idx_start = 0; char_idx_start < string_length_block; + for (size_type char_idx_start = 0; char_idx_start < string_length_block; char_idx_start += cudf::detail::warp_size) { - int const char_idx = char_idx_start + warp_lane; + auto const char_idx = char_idx_start + warp_lane; // If the current character is part of an escape sequence starting at the previous two // locations, the thread with the starting location should output the escaped character, and // the current thread should not output a character. @@ -375,10 +375,10 @@ std::unique_ptr url_decode(strings_column_view const& strings, size_type strings_count = strings.size(); if (strings_count == 0) return make_empty_column(type_id::STRING); - constexpr int num_warps_per_threadblock = 4; - constexpr int threadblock_size = num_warps_per_threadblock * cudf::detail::warp_size; - constexpr int char_block_size = 256; - int const num_threadblocks = + constexpr size_type num_warps_per_threadblock = 4; + constexpr size_type threadblock_size = num_warps_per_threadblock * cudf::detail::warp_size; + constexpr size_type char_block_size = 256; + auto const num_threadblocks = std::min(65536, cudf::util::div_rounding_up_unsafe(strings_count, num_warps_per_threadblock)); auto offset_count = strings_count + 1; @@ -386,7 +386,7 @@ std::unique_ptr url_decode(strings_column_view const& strings, // build offsets column auto offsets_column = make_numeric_column( - data_type{type_id::INT32}, offset_count, mask_state::UNALLOCATED, stream, mr); + data_type{type_to_id()}, offset_count, mask_state::UNALLOCATED, stream, mr); // count number of bytes in each string after decoding and store it in offsets_column auto offsets_view = offsets_column->view(); From 70fbec809a45fb4d462d7f3ef22464d00d2640e0 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Mon, 28 Aug 2023 20:32:38 -0500 Subject: [PATCH 116/230] Expose streams in public concatenate APIs (#13987) Contributes to #925 Authors: - Vyas Ramasubramani (https://github.com/vyasr) Approvers: - Mark Harris (https://github.com/harrism) - Divye Gala (https://github.com/divyegala) URL: https://github.com/rapidsai/cudf/pull/13987 --- cpp/include/cudf/concatenate.hpp | 7 ++++ cpp/include/cudf_test/column_wrapper.hpp | 10 ++--- cpp/src/copying/concatenate.cu | 9 +++-- cpp/tests/CMakeLists.txt | 1 + cpp/tests/streams/concatenate_test.cpp | 51 ++++++++++++++++++++++++ 5 files changed, 69 insertions(+), 9 deletions(-) create mode 100644 cpp/tests/streams/concatenate_test.cpp diff --git a/cpp/include/cudf/concatenate.hpp b/cpp/include/cudf/concatenate.hpp index 11c6a02c225..9ee55275a5e 100644 --- a/cpp/include/cudf/concatenate.hpp +++ b/cpp/include/cudf/concatenate.hpp @@ -17,6 +17,7 @@ #include #include +#include #include #include @@ -40,10 +41,12 @@ namespace cudf { * * @param views Column views whose bitmasks will be concatenated * @param mr Device memory resource used for allocating the returned memory + * @param stream CUDA stream used for device memory operations and kernel launches * @return Bitmasks of all the column views in the views vector */ rmm::device_buffer concatenate_masks( host_span views, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -53,12 +56,14 @@ rmm::device_buffer concatenate_masks( * @throws std::overflow_error If the total number of output rows exceeds cudf::size_type * * @param columns_to_concat Column views to be concatenated into a single column + * @param stream CUDA stream used 
for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned column's device memory * @return A single column having all the rows from the elements of `columns_to_concat` respectively * in the same order. */ std::unique_ptr concatenate( host_span columns_to_concat, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -80,12 +85,14 @@ std::unique_ptr concatenate( * @throws std::overflow_error If the total number of output rows exceeds cudf::size_type * * @param tables_to_concat Table views to be concatenated into a single table + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned table's device memory * @return A single table having all the rows from the elements of * `tables_to_concat` respectively in the same order. */ std::unique_ptr
concatenate( host_span tables_to_concat, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @} */ // end of group diff --git a/cpp/include/cudf_test/column_wrapper.hpp b/cpp/include/cudf_test/column_wrapper.hpp index 1e311322de1..cc8cac35ef4 100644 --- a/cpp/include/cudf_test/column_wrapper.hpp +++ b/cpp/include/cudf_test/column_wrapper.hpp @@ -1596,12 +1596,10 @@ class lists_column_wrapper : public detail::column_wrapper { thrust::copy_if( std::cbegin(cols), std::cend(cols), valids, std::back_inserter(children), thrust::identity{}); - // TODO: Once the public concatenate API exposes streams, use that instead. - auto data = - children.empty() - ? cudf::empty_like(expected_hierarchy) - : cudf::detail::concatenate( - children, cudf::test::get_default_stream(), rmm::mr::get_current_device_resource()); + auto data = children.empty() ? cudf::empty_like(expected_hierarchy) + : cudf::concatenate(children, + cudf::test::get_default_stream(), + rmm::mr::get_current_device_resource()); // increment depth depth = expected_depth + 1; diff --git a/cpp/src/copying/concatenate.cu b/cpp/src/copying/concatenate.cu index a53ec295512..35f06e47436 100644 --- a/cpp/src/copying/concatenate.cu +++ b/cpp/src/copying/concatenate.cu @@ -574,25 +574,28 @@ rmm::device_buffer concatenate_masks(host_span views, } // namespace detail rmm::device_buffer concatenate_masks(host_span views, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::concatenate_masks(views, cudf::get_default_stream(), mr); + return detail::concatenate_masks(views, stream, mr); } // Concatenates the elements from a vector of column_views std::unique_ptr concatenate(host_span columns_to_concat, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::concatenate(columns_to_concat, cudf::get_default_stream(), mr); + return detail::concatenate(columns_to_concat, stream, mr); } std::unique_ptr
concatenate(host_span tables_to_concat, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::concatenate(tables_to_concat, cudf::get_default_stream(), mr); + return detail::concatenate(tables_to_concat, stream, mr); } } // namespace cudf diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt index d9e34c739ea..c97e2a58ca4 100644 --- a/cpp/tests/CMakeLists.txt +++ b/cpp/tests/CMakeLists.txt @@ -623,6 +623,7 @@ ConfigureTest( ConfigureTest(STREAM_HASHING_TEST streams/hash_test.cpp STREAM_MODE testing) ConfigureTest(STREAM_COPYING_TEST streams/copying_test.cpp STREAM_MODE testing) ConfigureTest(STREAM_GROUPBY_TEST streams/groupby_test.cpp STREAM_MODE testing) +ConfigureTest(STREAM_CONCATENATE_TEST streams/concatenate_test.cpp STREAM_MODE testing) # ################################################################################################## # Install tests #################################################################################### diff --git a/cpp/tests/streams/concatenate_test.cpp b/cpp/tests/streams/concatenate_test.cpp new file mode 100644 index 00000000000..6e6ff58686f --- /dev/null +++ b/cpp/tests/streams/concatenate_test.cpp @@ -0,0 +1,51 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include + +#include +#include +#include + +class ConcatenateTest : public cudf::test::BaseFixture {}; + +TEST_F(ConcatenateTest, Column) +{ + cudf::test::fixed_width_column_wrapper const input1({0, 0, 0, 0, 0}); + cudf::test::fixed_width_column_wrapper const input2({1, 1, 1, 1, 1}); + std::vector views{input1, input2}; + auto result = cudf::concatenate(views, cudf::test::get_default_stream()); +} + +TEST_F(ConcatenateTest, Table) +{ + cudf::test::fixed_width_column_wrapper const input1({0, 0, 0, 0, 0}); + cudf::test::fixed_width_column_wrapper const input2({1, 1, 1, 1, 1}); + cudf::table_view tbl1({input1, input2}); + cudf::table_view tbl2({input2, input1}); + std::vector views{tbl1, tbl2}; + auto result = cudf::concatenate(views, cudf::test::get_default_stream()); +} + +TEST_F(ConcatenateTest, Masks) +{ + cudf::test::fixed_width_column_wrapper const input1( + {{0, 0, 0, 0, 0}, {false, false, false, false, false}}); + cudf::test::fixed_width_column_wrapper const input2( + {{0, 0, 0, 0, 0}, {true, true, true, true, true}}); + std::vector views{input1, input2}; + auto result = cudf::concatenate_masks(views, cudf::test::get_default_stream()); +} From cd56cc2b4cc47a1d0c63e56fac945a66905c28df Mon Sep 17 00:00:00 2001 From: AJ Schmidt Date: Tue, 29 Aug 2023 10:45:53 -0400 Subject: [PATCH 117/230] Use `copy-pr-bot` (#13970) This PR replaces the `copy_prs` functionality from the `ops-bot` with the new dedicated `copy-pr-bot` GitHub application. Thorough documentation for the new `copy-pr-bot` application can be viewed below. - https://docs.gha-runners.nvidia.com/apps/copy-pr-bot/ **Important**: `copy-pr-bot` enforces signed commits. 
If an organization member opens a PR that contains unsigned commits, it will be deemed untrusted and therefore require an `/ok to test` comment. See the GitHub docs [here](https://docs.github.com/en/authentication/managing-commit-signature-verification/about-commit-signature-verification) for information on how to set up commit signing. Any time a PR is deemed untrusted, it will receive a comment that looks like this: https://github.com/rapidsai/ci-imgs/pull/63#issuecomment-1688973208. Every subsequent commit on an untrusted PR will require an additional `/ok to test` comment. Any existing PRs that have unsigned commits after this change is merged will require an `/ok to test` comment for each subsequent commit _or_ the PR can be rebased to include signed commits as mentioned in the docs below: https://docs.gha-runners.nvidia.com/cpr/contributors. This information is all included on the documentation page linked above. _I've skipped CI on this PR since it's not a change that is tested._ [skip ci] --- .github/copy-pr-bot.yaml | 4 ++++ .github/ops-bot.yaml | 1 - 2 files changed, 4 insertions(+), 1 deletion(-) create mode 100644 .github/copy-pr-bot.yaml diff --git a/.github/copy-pr-bot.yaml b/.github/copy-pr-bot.yaml new file mode 100644 index 00000000000..895ba83ee54 --- /dev/null +++ b/.github/copy-pr-bot.yaml @@ -0,0 +1,4 @@ +# Configuration file for `copy-pr-bot` GitHub App +# https://docs.gha-runners.nvidia.com/apps/copy-pr-bot/ + +enabled: true diff --git a/.github/ops-bot.yaml b/.github/ops-bot.yaml index 2d1444c595d..9a0b4155035 100644 --- a/.github/ops-bot.yaml +++ b/.github/ops-bot.yaml @@ -5,5 +5,4 @@ auto_merger: true branch_checker: true label_checker: true release_drafter: true -copy_prs: true recently_updated: true From e2e92c46741ea6ef71a657a2cdbc3c010497943e Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Tue, 29 Aug 2023 13:04:25 -0500 Subject: [PATCH 118/230] Expose streams in public filling APIs (#13990) Contributes to #925 Authors: - Vyas Ramasubramani (https://github.com/vyasr) Approvers: - Nghia Truong (https://github.com/ttnghia) - Bradley Dice (https://github.com/bdice) URL: https://github.com/rapidsai/cudf/pull/13990 --- cpp/include/cudf/filling.hpp | 17 ++++- cpp/src/filling/calendrical_month_sequence.cu | 5 +- cpp/src/filling/fill.cu | 8 +- cpp/src/filling/repeat.cu | 6 +- cpp/src/filling/sequence.cu | 6 +- cpp/tests/CMakeLists.txt | 1 + cpp/tests/streams/filling_test.cpp | 76 +++++++++++++++++++ 7 files changed, 109 insertions(+), 10 deletions(-) create mode 100644 cpp/tests/streams/filling_test.cpp diff --git a/cpp/include/cudf/filling.hpp b/cpp/include/cudf/filling.hpp index a82bb9d1a48..1268f488919 100644 --- a/cpp/include/cudf/filling.hpp +++ b/cpp/include/cudf/filling.hpp @@ -17,6 +17,7 @@ #pragma once #include +#include #include @@ -54,11 +55,13 @@ namespace cudf { * @param begin The starting index of the fill range (inclusive) * @param end The index of the last element in the fill range (exclusive) * @param value The scalar value to fill + * @param stream CUDA stream used for device memory operations and kernel launches */ void fill_in_place(mutable_column_view& destination, size_type begin, size_type end, - scalar const& value); + scalar const& value, + rmm::cuda_stream_view stream = cudf::get_default_stream()); /** * @brief Fills a range of elements in a column out-of-place with a scalar @@ -79,6 +82,7 @@ void fill_in_place(mutable_column_view& destination, * @param begin The starting index of the fill range (inclusive) * @param end The index of 
the last element in the fill range (exclusive) * @param value The scalar value to fill + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned column's device memory * @return The result output column */ @@ -87,6 +91,7 @@ std::unique_ptr fill( size_type begin, size_type end, scalar const& value, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -113,12 +118,14 @@ std::unique_ptr fill( * * @param input_table Input table * @param count Non-nullable column of an integral type + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned table's device memory * @return The result table containing the repetitions */ std::unique_ptr
repeat( table_view const& input_table, column_view const& count, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -136,12 +143,14 @@ std::unique_ptr
repeat( * * @param input_table Input table * @param count Number of repetitions + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned table's device memory * @return The result table containing the repetitions */ std::unique_ptr
repeat( table_view const& input_table, size_type count, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -164,6 +173,7 @@ std::unique_ptr
repeat( * @param size Size of the output column * @param init First value in the sequence * @param step Increment value + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned column's device memory * @return The result column containing the generated sequence */ @@ -171,6 +181,7 @@ std::unique_ptr sequence( size_type size, scalar const& init, scalar const& step, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -190,12 +201,14 @@ std::unique_ptr sequence( * * @param size Size of the output column * @param init First value in the sequence + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned column's device memory * @return The result column containing the generated sequence */ std::unique_ptr sequence( size_type size, scalar const& init, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -217,6 +230,7 @@ std::unique_ptr sequence( * @param size Number of timestamps to generate * @param init The initial timestamp * @param months Months to increment + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned column's device memory * * @return Timestamps column with sequences of months @@ -225,6 +239,7 @@ std::unique_ptr calendrical_month_sequence( size_type size, scalar const& init, size_type months, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @} */ // end of group diff --git a/cpp/src/filling/calendrical_month_sequence.cu b/cpp/src/filling/calendrical_month_sequence.cu index f45634a615e..80badb7d566 100644 --- a/cpp/src/filling/calendrical_month_sequence.cu +++ b/cpp/src/filling/calendrical_month_sequence.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2022, NVIDIA CORPORATION. + * Copyright (c) 2021-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
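Editor's note: with every public filling API now taking an optional `rmm::cuda_stream_view` (defaulted to `cudf::get_default_stream()`), callers can thread their own stream through, mirroring the concatenate change two patches earlier. A minimal usage sketch is below; the function name, stream handling, and values are illustrative and not part of the patch.

```cpp
// Sketch of calling a stream-aware public filling API added by this patch.
#include <cudf/filling.hpp>
#include <cudf/scalar/scalar.hpp>
#include <rmm/cuda_stream.hpp>

void fill_on_my_stream()
{
  rmm::cuda_stream owned_stream;        // caller-owned stream
  auto stream = owned_stream.view();

  cudf::numeric_scalar<int32_t> init{0, true, stream};
  cudf::numeric_scalar<int32_t> step{2, true, stream};

  // Generate 0, 2, 4, ... on the caller's stream instead of the default stream.
  auto col = cudf::sequence(1000, init, step, stream);

  stream.synchronize();
}
```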
@@ -40,10 +40,11 @@ std::unique_ptr calendrical_month_sequence(size_type size, std::unique_ptr calendrical_month_sequence(size_type size, scalar const& init, size_type months, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::calendrical_month_sequence(size, init, months, cudf::get_default_stream(), mr); + return detail::calendrical_month_sequence(size, init, months, stream, mr); } } // namespace cudf diff --git a/cpp/src/filling/fill.cu b/cpp/src/filling/fill.cu index 342392c773e..3d84db121fc 100644 --- a/cpp/src/filling/fill.cu +++ b/cpp/src/filling/fill.cu @@ -246,20 +246,22 @@ std::unique_ptr fill(column_view const& input, void fill_in_place(mutable_column_view& destination, size_type begin, size_type end, - scalar const& value) + scalar const& value, + rmm::cuda_stream_view stream) { CUDF_FUNC_RANGE(); - return detail::fill_in_place(destination, begin, end, value, cudf::get_default_stream()); + return detail::fill_in_place(destination, begin, end, value, stream); } std::unique_ptr fill(column_view const& input, size_type begin, size_type end, scalar const& value, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::fill(input, begin, end, value, cudf::get_default_stream(), mr); + return detail::fill(input, begin, end, value, stream, mr); } } // namespace cudf diff --git a/cpp/src/filling/repeat.cu b/cpp/src/filling/repeat.cu index 2be15a06c0d..677d9a09515 100644 --- a/cpp/src/filling/repeat.cu +++ b/cpp/src/filling/repeat.cu @@ -156,18 +156,20 @@ std::unique_ptr
repeat(table_view const& input_table, std::unique_ptr
repeat(table_view const& input_table, column_view const& count, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::repeat(input_table, count, cudf::get_default_stream(), mr); + return detail::repeat(input_table, count, stream, mr); } std::unique_ptr
repeat(table_view const& input_table, size_type count, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::repeat(input_table, count, cudf::get_default_stream(), mr); + return detail::repeat(input_table, count, stream, mr); } } // namespace cudf diff --git a/cpp/src/filling/sequence.cu b/cpp/src/filling/sequence.cu index b4bab369c61..99a17f8b0e0 100644 --- a/cpp/src/filling/sequence.cu +++ b/cpp/src/filling/sequence.cu @@ -150,18 +150,20 @@ std::unique_ptr sequence(size_type size, std::unique_ptr sequence(size_type size, scalar const& init, scalar const& step, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::sequence(size, init, step, cudf::get_default_stream(), mr); + return detail::sequence(size, init, step, stream, mr); } std::unique_ptr sequence(size_type size, scalar const& init, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::sequence(size, init, cudf::get_default_stream(), mr); + return detail::sequence(size, init, stream, mr); } } // namespace cudf diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt index c97e2a58ca4..8a0aa27b175 100644 --- a/cpp/tests/CMakeLists.txt +++ b/cpp/tests/CMakeLists.txt @@ -624,6 +624,7 @@ ConfigureTest(STREAM_HASHING_TEST streams/hash_test.cpp STREAM_MODE testing) ConfigureTest(STREAM_COPYING_TEST streams/copying_test.cpp STREAM_MODE testing) ConfigureTest(STREAM_GROUPBY_TEST streams/groupby_test.cpp STREAM_MODE testing) ConfigureTest(STREAM_CONCATENATE_TEST streams/concatenate_test.cpp STREAM_MODE testing) +ConfigureTest(STREAM_FILLING_TEST streams/filling_test.cpp STREAM_MODE testing) # ################################################################################################## # Install tests #################################################################################### diff --git a/cpp/tests/streams/filling_test.cpp b/cpp/tests/streams/filling_test.cpp new file mode 100644 index 00000000000..b822743d4ca --- /dev/null +++ b/cpp/tests/streams/filling_test.cpp @@ -0,0 +1,76 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include +#include + +#include +#include +#include + +class FillingTest : public cudf::test::BaseFixture {}; + +TEST_F(FillingTest, FillInPlace) +{ + cudf::test::fixed_width_column_wrapper col({0, 0, 0, 0, 0}); + auto scalar = cudf::numeric_scalar(5, true, cudf::test::get_default_stream()); + cudf::mutable_column_view mut_view = col; + cudf::fill_in_place(mut_view, 0, 4, scalar, cudf::test::get_default_stream()); +} + +TEST_F(FillingTest, Fill) +{ + cudf::test::fixed_width_column_wrapper const col({0, 0, 0, 0, 0}); + auto scalar = cudf::numeric_scalar(5, true, cudf::test::get_default_stream()); + cudf::fill(col, 0, 4, scalar, cudf::test::get_default_stream()); +} + +TEST_F(FillingTest, RepeatVariable) +{ + cudf::test::fixed_width_column_wrapper const col({0, 0, 0, 0, 0}); + cudf::table_view const table({col}); + cudf::test::fixed_width_column_wrapper const counts({1, 2, 3, 4, 5}); + cudf::repeat(table, counts, cudf::test::get_default_stream()); +} + +TEST_F(FillingTest, RepeatConst) +{ + cudf::test::fixed_width_column_wrapper const col({0, 0, 0, 0, 0}); + cudf::table_view const table({col}); + cudf::repeat(table, 5, cudf::test::get_default_stream()); +} + +TEST_F(FillingTest, SequenceStep) +{ + auto init = cudf::numeric_scalar(5, true, cudf::test::get_default_stream()); + auto step = cudf::numeric_scalar(2, true, cudf::test::get_default_stream()); + cudf::sequence(10, init, step, cudf::test::get_default_stream()); +} + +TEST_F(FillingTest, Sequence) +{ + auto init = cudf::numeric_scalar(5, true, cudf::test::get_default_stream()); + cudf::sequence(10, init, cudf::test::get_default_stream()); +} + +TEST_F(FillingTest, CalendricalMonthSequence) +{ + cudf::timestamp_scalar init( + 1629852896L, true, cudf::test::get_default_stream()); // 2021-08-25 00:54:56 GMT + + cudf::calendrical_month_sequence(10, init, 2, cudf::test::get_default_stream()); +} From 14522003f3bbd8041e66b1ff34077acdae4869ba Mon Sep 17 00:00:00 2001 From: nvdbaranec <56695930+nvdbaranec@users.noreply.github.com> Date: Tue, 29 Aug 2023 14:33:57 -0500 Subject: [PATCH 119/230] Use cudf::thread_index_type in get_json_object and tdigest kernels (#13962) Convert the grid-stride loop in `get_json_object_kernel` to use `cudf::thread_index_type`. Convert `compute_percentiles_kernel` to use `cudf::thread_index_type`. ## Checklist - [x] I am familiar with the [Contributing Guidelines](https://github.com/rapidsai/cudf/blob/HEAD/CONTRIBUTING.md). - [x] New or existing tests cover these changes. - [x] The documentation is up to date with these changes. 
Authors: - https://github.com/nvdbaranec Approvers: - Bradley Dice (https://github.com/bdice) - MithunR (https://github.com/mythrocks) - Mike Wilson (https://github.com/hyperbolic2346) - Karthikeyan (https://github.com/karthikeyann) --- cpp/src/quantiles/tdigest/tdigest.cu | 2 +- cpp/src/strings/json/json_path.cu | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/cpp/src/quantiles/tdigest/tdigest.cu b/cpp/src/quantiles/tdigest/tdigest.cu index cfdb386ff64..79a25f79f60 100644 --- a/cpp/src/quantiles/tdigest/tdigest.cu +++ b/cpp/src/quantiles/tdigest/tdigest.cu @@ -74,7 +74,7 @@ __global__ void compute_percentiles_kernel(device_span tdigest_ double const* cumulative_weight_, double* output) { - int const tid = threadIdx.x + blockIdx.x * blockDim.x; + auto const tid = cudf::detail::grid_1d::global_thread_id(); auto const num_tdigests = tdigest_offsets.size() - 1; auto const tdigest_index = tid / percentiles.size(); diff --git a/cpp/src/strings/json/json_path.cu b/cpp/src/strings/json/json_path.cu index be5b089c6e0..2d2691e0518 100644 --- a/cpp/src/strings/json/json_path.cu +++ b/cpp/src/strings/json/json_path.cu @@ -907,8 +907,8 @@ __launch_bounds__(block_size) __global__ thrust::optional out_valid_count, get_json_object_options options) { - size_type tid = threadIdx.x + (blockDim.x * blockIdx.x); - size_type stride = blockDim.x * gridDim.x; + auto tid = cudf::detail::grid_1d::global_thread_id(); + auto const stride = cudf::thread_index_type{blockDim.x} * cudf::thread_index_type{gridDim.x}; size_type warp_valid_count{0}; From 7b9f4a17579befd902d1c30af38daa5fe493e335 Mon Sep 17 00:00:00 2001 From: Gera Shegalov Date: Tue, 29 Aug 2023 20:59:04 -0700 Subject: [PATCH 120/230] Use HostMemoryAllocator in jni::allocate_host_buffer (#13975) Fixes #13940 Contributes to NVIDIA/spark-rapids#8889 - Pass an explicit host memory allocator to `jni::allocate_host_buffer` - Consistently check for errors from NewGlobalRef - Consistently guard against DelteteGlobalRef on a null Authors: - Gera Shegalov (https://github.com/gerashegalov) Approvers: - https://github.com/nvdbaranec - Jason Lowe (https://github.com/jlowe) URL: https://github.com/rapidsai/cudf/pull/13975 --- java/src/main/java/ai/rapids/cudf/Table.java | 84 +++++++++++++++---- java/src/main/native/include/jni_utils.hpp | 15 ++++ .../main/native/src/ContiguousTableJni.cpp | 10 +-- java/src/main/native/src/CudfJni.cpp | 21 ++--- java/src/main/native/src/RmmJni.cpp | 7 +- java/src/main/native/src/TableJni.cpp | 59 ++++++------- java/src/main/native/src/cudf_jni_apis.hpp | 3 +- .../main/native/src/jni_writer_data_sink.hpp | 29 +++---- 8 files changed, 133 insertions(+), 95 deletions(-) diff --git a/java/src/main/java/ai/rapids/cudf/Table.java b/java/src/main/java/ai/rapids/cudf/Table.java index 57189b052b6..b2eb33d47dc 100644 --- a/java/src/main/java/ai/rapids/cudf/Table.java +++ b/java/src/main/java/ai/rapids/cudf/Table.java @@ -336,7 +336,9 @@ private static native long writeParquetBufferBegin(String[] columnNames, boolean[] isBinaryValues, boolean[] hasParquetFieldIds, int[] parquetFieldIds, - HostBufferConsumer consumer) throws CudfException; + HostBufferConsumer consumer, + HostMemoryAllocator hostMemoryAllocator + ) throws CudfException; /** * Write out a table to an open handle. 
@@ -419,7 +421,9 @@ private static native long writeORCBufferBegin(String[] columnNames, int compression, int[] precisions, boolean[] isMapValues, - HostBufferConsumer consumer) throws CudfException; + HostBufferConsumer consumer, + HostMemoryAllocator hostMemoryAllocator + ) throws CudfException; /** * Write out a table to an open handle. @@ -447,10 +451,12 @@ private static native long writeORCBufferBegin(String[] columnNames, * Setup everything to write Arrow IPC formatted data to a buffer. * @param columnNames names that correspond to the table columns * @param consumer consumer of host buffers produced. + * @param hostMemoryAllocator allocator for host memory buffers. * @return a handle that is used in later calls to writeArrowIPCChunk and writeArrowIPCEnd. */ private static native long writeArrowIPCBufferBegin(String[] columnNames, - HostBufferConsumer consumer); + HostBufferConsumer consumer, + HostMemoryAllocator hostMemoryAllocator); /** * Convert a cudf table to an arrow table handle. @@ -906,7 +912,9 @@ private static native long startWriteCSVToBuffer(String[] columnNames, String trueValue, String falseValue, int quoteStyle, - HostBufferConsumer buffer) throws CudfException; + HostBufferConsumer buffer, + HostMemoryAllocator hostMemoryAllocator + ) throws CudfException; private static native void writeCSVChunkToBuffer(long writerHandle, long tableHandle); @@ -915,7 +923,8 @@ private static native long startWriteCSVToBuffer(String[] columnNames, private static class CSVTableWriter extends TableWriter { private HostBufferConsumer consumer; - private CSVTableWriter(CSVWriterOptions options, HostBufferConsumer consumer) { + private CSVTableWriter(CSVWriterOptions options, HostBufferConsumer consumer, + HostMemoryAllocator hostMemoryAllocator) { super(startWriteCSVToBuffer(options.getColumnNames(), options.getIncludeHeader(), options.getRowDelimiter(), @@ -924,7 +933,7 @@ private CSVTableWriter(CSVWriterOptions options, HostBufferConsumer consumer) { options.getTrueValue(), options.getFalseValue(), options.getQuoteStyle().nativeId, - consumer)); + consumer, hostMemoryAllocator)); this.consumer = consumer; } @@ -949,8 +958,14 @@ public void close() throws CudfException { } } - public static TableWriter getCSVBufferWriter(CSVWriterOptions options, HostBufferConsumer bufferConsumer) { - return new CSVTableWriter(options, bufferConsumer); + public static TableWriter getCSVBufferWriter(CSVWriterOptions options, + HostBufferConsumer bufferConsumer, HostMemoryAllocator hostMemoryAllocator) { + return new CSVTableWriter(options, bufferConsumer, hostMemoryAllocator); + } + + public static TableWriter getCSVBufferWriter(CSVWriterOptions options, + HostBufferConsumer bufferConsumer) { + return getCSVBufferWriter(options, bufferConsumer, DefaultHostMemoryAllocator.get()); } /** @@ -1393,7 +1408,8 @@ private ParquetTableWriter(ParquetWriterOptions options, File outputFile) { this.consumer = null; } - private ParquetTableWriter(ParquetWriterOptions options, HostBufferConsumer consumer) { + private ParquetTableWriter(ParquetWriterOptions options, HostBufferConsumer consumer, + HostMemoryAllocator hostMemoryAllocator) { super(writeParquetBufferBegin(options.getFlatColumnNames(), options.getTopLevelChildren(), options.getFlatNumChildren(), @@ -1408,7 +1424,7 @@ private ParquetTableWriter(ParquetWriterOptions options, HostBufferConsumer cons options.getFlatIsBinary(), options.getFlatHasParquetFieldId(), options.getFlatParquetFieldId(), - consumer)); + consumer, hostMemoryAllocator)); this.consumer = 
consumer; } @@ -1448,11 +1464,18 @@ public static TableWriter writeParquetChunked(ParquetWriterOptions options, File * @param options the parquet writer options. * @param consumer a class that will be called when host buffers are ready with parquet * formatted data in them. + * @param hostMemoryAllocator allocator for host memory buffers * @return a table writer to use for writing out multiple tables. */ + public static TableWriter writeParquetChunked(ParquetWriterOptions options, + HostBufferConsumer consumer, + HostMemoryAllocator hostMemoryAllocator) { + return new ParquetTableWriter(options, consumer, hostMemoryAllocator); + } + public static TableWriter writeParquetChunked(ParquetWriterOptions options, HostBufferConsumer consumer) { - return new ParquetTableWriter(options, consumer); + return writeParquetChunked(options, consumer, DefaultHostMemoryAllocator.get()); } /** @@ -1461,10 +1484,12 @@ public static TableWriter writeParquetChunked(ParquetWriterOptions options, * @param options the Parquet writer options. * @param consumer a class that will be called when host buffers are ready with Parquet * formatted data in them. + * @param hostMemoryAllocator allocator for host memory buffers * @param columnViews ColumnViews to write to Parquet */ public static void writeColumnViewsToParquet(ParquetWriterOptions options, HostBufferConsumer consumer, + HostMemoryAllocator hostMemoryAllocator, ColumnView... columnViews) { assert columnViews != null && columnViews.length > 0 : "ColumnViews can't be null or empty"; long rows = columnViews[0].getRowCount(); @@ -1483,7 +1508,9 @@ public static void writeColumnViewsToParquet(ParquetWriterOptions options, long nativeHandle = createCudfTableView(viewPointers); try { - try (ParquetTableWriter writer = new ParquetTableWriter(options, consumer)) { + try ( + ParquetTableWriter writer = new ParquetTableWriter(options, consumer, hostMemoryAllocator) + ) { long total = 0; for (ColumnView cv : columnViews) { total += cv.getDeviceMemorySize(); @@ -1495,6 +1522,12 @@ public static void writeColumnViewsToParquet(ParquetWriterOptions options, } } + public static void writeColumnViewsToParquet(ParquetWriterOptions options, + HostBufferConsumer consumer, + ColumnView... columnViews) { + writeColumnViewsToParquet(options, consumer, DefaultHostMemoryAllocator.get(), columnViews); + } + private static class ORCTableWriter extends TableWriter { HostBufferConsumer consumer; @@ -1512,7 +1545,8 @@ private ORCTableWriter(ORCWriterOptions options, File outputFile) { this.consumer = null; } - private ORCTableWriter(ORCWriterOptions options, HostBufferConsumer consumer) { + private ORCTableWriter(ORCWriterOptions options, HostBufferConsumer consumer, + HostMemoryAllocator hostMemoryAllocator) { super(writeORCBufferBegin(options.getFlatColumnNames(), options.getTopLevelChildren(), options.getFlatNumChildren(), @@ -1522,7 +1556,7 @@ private ORCTableWriter(ORCWriterOptions options, HostBufferConsumer consumer) { options.getCompressionType().nativeId, options.getFlatPrecision(), options.getFlatIsMap(), - consumer)); + consumer, hostMemoryAllocator)); this.consumer = consumer; } @@ -1562,10 +1596,16 @@ public static TableWriter writeORCChunked(ORCWriterOptions options, File outputF * @param options the ORC writer options. * @param consumer a class that will be called when host buffers are ready with ORC * formatted data in them. + * @param hostMemoryAllocator allocator for host memory buffers * @return a table writer to use for writing out multiple tables. 
*/ + public static TableWriter writeORCChunked(ORCWriterOptions options, HostBufferConsumer consumer, + HostMemoryAllocator hostMemoryAllocator) { + return new ORCTableWriter(options, consumer, hostMemoryAllocator); + } + public static TableWriter writeORCChunked(ORCWriterOptions options, HostBufferConsumer consumer) { - return new ORCTableWriter(options, consumer); + return writeORCChunked(options, consumer, DefaultHostMemoryAllocator.get()); } private static class ArrowIPCTableWriter extends TableWriter { @@ -1580,8 +1620,9 @@ private ArrowIPCTableWriter(ArrowIPCWriterOptions options, File outputFile) { this.maxChunkSize = options.getMaxChunkSize(); } - private ArrowIPCTableWriter(ArrowIPCWriterOptions options, HostBufferConsumer consumer) { - super(writeArrowIPCBufferBegin(options.getColumnNames(), consumer)); + private ArrowIPCTableWriter(ArrowIPCWriterOptions options, HostBufferConsumer consumer, + HostMemoryAllocator hostMemoryAllocator) { + super(writeArrowIPCBufferBegin(options.getColumnNames(), consumer, hostMemoryAllocator)); this.callback = options.getCallback(); this.consumer = consumer; this.maxChunkSize = options.getMaxChunkSize(); @@ -1629,11 +1670,18 @@ public static TableWriter writeArrowIPCChunked(ArrowIPCWriterOptions options, Fi * @param options the arrow IPC writer options. * @param consumer a class that will be called when host buffers are ready with arrow IPC * formatted data in them. + * @param hostMemoryAllocator allocator for host memory buffers * @return a table writer to use for writing out multiple tables. */ + public static TableWriter writeArrowIPCChunked(ArrowIPCWriterOptions options, + HostBufferConsumer consumer, + HostMemoryAllocator hostMemoryAllocator) { + return new ArrowIPCTableWriter(options, consumer, hostMemoryAllocator); + } + public static TableWriter writeArrowIPCChunked(ArrowIPCWriterOptions options, HostBufferConsumer consumer) { - return new ArrowIPCTableWriter(options, consumer); + return writeArrowIPCChunked(options, consumer, DefaultHostMemoryAllocator.get()); } private static class ArrowReaderWrapper implements AutoCloseable { diff --git a/java/src/main/native/include/jni_utils.hpp b/java/src/main/native/include/jni_utils.hpp index ff4da893329..f342fca8933 100644 --- a/java/src/main/native/include/jni_utils.hpp +++ b/java/src/main/native/include/jni_utils.hpp @@ -786,6 +786,21 @@ inline void jni_cuda_check(JNIEnv *const env, cudaError_t cuda_status) { } } +inline auto add_global_ref(JNIEnv *env, jobject jobj) { + auto new_global_ref = env->NewGlobalRef(jobj); + if (new_global_ref == nullptr) { + throw cudf::jni::jni_exception("global ref"); + } + return new_global_ref; +} + +inline nullptr_t del_global_ref(JNIEnv *env, jobject jobj) { + if (jobj != nullptr) { + env->DeleteGlobalRef(jobj); + } + return nullptr; +} + } // namespace jni } // namespace cudf diff --git a/java/src/main/native/src/ContiguousTableJni.cpp b/java/src/main/native/src/ContiguousTableJni.cpp index 7eddea2a895..8c99c77ca1f 100644 --- a/java/src/main/native/src/ContiguousTableJni.cpp +++ b/java/src/main/native/src/ContiguousTableJni.cpp @@ -55,10 +55,7 @@ bool cache_contiguous_table_jni(JNIEnv *env) { } void release_contiguous_table_jni(JNIEnv *env) { - if (Contiguous_table_jclass != nullptr) { - env->DeleteGlobalRef(Contiguous_table_jclass); - Contiguous_table_jclass = nullptr; - } + Contiguous_table_jclass = cudf::jni::del_global_ref(env, Contiguous_table_jclass); } bool cache_contig_split_group_by_result_jni(JNIEnv *env) { @@ -87,10 +84,7 @@ bool 
cache_contig_split_group_by_result_jni(JNIEnv *env) { } void release_contig_split_group_by_result_jni(JNIEnv *env) { - if (Contig_split_group_by_result_jclass != nullptr) { - env->DeleteGlobalRef(Contig_split_group_by_result_jclass); - Contig_split_group_by_result_jclass = nullptr; - } + Contig_split_group_by_result_jclass = del_global_ref(env, Contig_split_group_by_result_jclass); } jobject contig_split_group_by_result_from(JNIEnv *env, jobjectArray &groups) { diff --git a/java/src/main/native/src/CudfJni.cpp b/java/src/main/native/src/CudfJni.cpp index acbf309b4b7..0f143086451 100644 --- a/java/src/main/native/src/CudfJni.cpp +++ b/java/src/main/native/src/CudfJni.cpp @@ -46,7 +46,6 @@ constexpr bool is_ptds_enabled{false}; #endif static jclass Host_memory_buffer_jclass; -static jmethodID Host_buffer_allocate; static jfieldID Host_buffer_address; static jfieldID Host_buffer_length; @@ -59,11 +58,6 @@ static bool cache_host_memory_buffer_jni(JNIEnv *env) { return false; } - Host_buffer_allocate = env->GetStaticMethodID(cls, "allocate", HOST_MEMORY_BUFFER_SIG("JZ")); - if (Host_buffer_allocate == nullptr) { - return false; - } - Host_buffer_address = env->GetFieldID(cls, "address", "J"); if (Host_buffer_address == nullptr) { return false; @@ -83,15 +77,16 @@ static bool cache_host_memory_buffer_jni(JNIEnv *env) { } static void release_host_memory_buffer_jni(JNIEnv *env) { - if (Host_memory_buffer_jclass != nullptr) { - env->DeleteGlobalRef(Host_memory_buffer_jclass); - Host_memory_buffer_jclass = nullptr; - } + Host_memory_buffer_jclass = del_global_ref(env, Host_memory_buffer_jclass); } -jobject allocate_host_buffer(JNIEnv *env, jlong amount, jboolean prefer_pinned) { - jobject ret = env->CallStaticObjectMethod(Host_memory_buffer_jclass, Host_buffer_allocate, amount, - prefer_pinned); +jobject allocate_host_buffer(JNIEnv *env, jlong amount, jboolean prefer_pinned, + jobject host_memory_allocator) { + auto const host_memory_allocator_class = env->GetObjectClass(host_memory_allocator); + auto const allocateMethodId = + env->GetMethodID(host_memory_allocator_class, "allocate", HOST_MEMORY_BUFFER_SIG("JZ")); + jobject ret = + env->CallObjectMethod(host_memory_allocator, allocateMethodId, amount, prefer_pinned); if (env->ExceptionCheck()) { throw std::runtime_error("allocateHostBuffer threw an exception"); diff --git a/java/src/main/native/src/RmmJni.cpp b/java/src/main/native/src/RmmJni.cpp index 5bbb5383d93..3c49d153cb6 100644 --- a/java/src/main/native/src/RmmJni.cpp +++ b/java/src/main/native/src/RmmJni.cpp @@ -197,10 +197,7 @@ class java_event_handler_memory_resource : public device_memory_resource { update_thresholds(env, alloc_thresholds, jalloc_thresholds); update_thresholds(env, dealloc_thresholds, jdealloc_thresholds); - handler_obj = env->NewGlobalRef(jhandler); - if (handler_obj == nullptr) { - throw cudf::jni::jni_exception("global ref"); - } + handler_obj = cudf::jni::add_global_ref(env, jhandler); } virtual ~java_event_handler_memory_resource() { @@ -209,7 +206,7 @@ class java_event_handler_memory_resource : public device_memory_resource { // already be destroyed and this thread should not try to attach to get an environment. 
JNIEnv *env = nullptr; if (jvm->GetEnv(reinterpret_cast(&env), cudf::jni::MINIMUM_JNI_VERSION) == JNI_OK) { - env->DeleteGlobalRef(handler_obj); + handler_obj = cudf::jni::del_global_ref(env, handler_obj); } handler_obj = nullptr; } diff --git a/java/src/main/native/src/TableJni.cpp b/java/src/main/native/src/TableJni.cpp index d6ef2a1e26c..f7ada4305db 100644 --- a/java/src/main/native/src/TableJni.cpp +++ b/java/src/main/native/src/TableJni.cpp @@ -224,7 +224,7 @@ class native_arrow_ipc_writer_handle final { class jni_arrow_output_stream final : public arrow::io::OutputStream { public: - explicit jni_arrow_output_stream(JNIEnv *env, jobject callback) { + explicit jni_arrow_output_stream(JNIEnv *env, jobject callback, jobject host_memory_allocator) { if (env->GetJavaVM(&jvm) < 0) { throw std::runtime_error("GetJavaVM failed"); } @@ -239,11 +239,8 @@ class jni_arrow_output_stream final : public arrow::io::OutputStream { if (handle_buffer_method == nullptr) { throw cudf::jni::jni_exception("handleBuffer method"); } - - this->callback = env->NewGlobalRef(callback); - if (this->callback == nullptr) { - throw cudf::jni::jni_exception("global ref"); - } + this->callback = add_global_ref(env, callback); + this->host_memory_allocator = add_global_ref(env, host_memory_allocator); } virtual ~jni_arrow_output_stream() { @@ -252,13 +249,13 @@ class jni_arrow_output_stream final : public arrow::io::OutputStream { // already be destroyed and this thread should not try to attach to get an environment. JNIEnv *env = nullptr; if (jvm->GetEnv(reinterpret_cast(&env), cudf::jni::MINIMUM_JNI_VERSION) == JNI_OK) { - env->DeleteGlobalRef(callback); - if (current_buffer != nullptr) { - env->DeleteGlobalRef(current_buffer); - } + callback = del_global_ref(env, callback); + current_buffer = del_global_ref(env, current_buffer); + host_memory_allocator = del_global_ref(env, host_memory_allocator); } callback = nullptr; current_buffer = nullptr; + host_memory_allocator = nullptr; } arrow::Status Write(const std::shared_ptr &data) override { @@ -293,10 +290,7 @@ class jni_arrow_output_stream final : public arrow::io::OutputStream { if (current_buffer_written > 0) { JNIEnv *env = cudf::jni::get_jni_env(jvm); handle_buffer(env, current_buffer, current_buffer_written); - if (current_buffer != nullptr) { - env->DeleteGlobalRef(current_buffer); - } - current_buffer = nullptr; + current_buffer = del_global_ref(env, current_buffer); current_buffer_len = 0; current_buffer_data = nullptr; current_buffer_written = 0; @@ -323,11 +317,10 @@ class jni_arrow_output_stream final : public arrow::io::OutputStream { void rotate_buffer(JNIEnv *env) { if (current_buffer != nullptr) { handle_buffer(env, current_buffer, current_buffer_written); - env->DeleteGlobalRef(current_buffer); - current_buffer = nullptr; } - jobject tmp_buffer = allocate_host_buffer(env, alloc_size, true); - current_buffer = env->NewGlobalRef(tmp_buffer); + current_buffer = del_global_ref(env, current_buffer); + jobject tmp_buffer = allocate_host_buffer(env, alloc_size, true, host_memory_allocator); + current_buffer = add_global_ref(env, tmp_buffer); current_buffer_len = get_host_buffer_length(env, current_buffer); current_buffer_data = reinterpret_cast(get_host_buffer_address(env, current_buffer)); current_buffer_written = 0; @@ -350,6 +343,7 @@ class jni_arrow_output_stream final : public arrow::io::OutputStream { int64_t total_written = 0; long alloc_size = MINIMUM_WRITE_BUFFER_SIZE; bool is_closed = false; + jobject host_memory_allocator; }; class 
jni_arrow_input_stream final : public arrow::io::InputStream { @@ -370,10 +364,7 @@ class jni_arrow_input_stream final : public arrow::io::InputStream { throw cudf::jni::jni_exception("readInto method"); } - this->callback = env->NewGlobalRef(callback); - if (this->callback == nullptr) { - throw cudf::jni::jni_exception("global ref"); - } + this->callback = add_global_ref(env, callback); } virtual ~jni_arrow_input_stream() { @@ -382,7 +373,7 @@ class jni_arrow_input_stream final : public arrow::io::InputStream { // already be destroyed and this thread should not try to attach to get an environment. JNIEnv *env = nullptr; if (jvm->GetEnv(reinterpret_cast(&env), cudf::jni::MINIMUM_JNI_VERSION) == JNI_OK) { - env->DeleteGlobalRef(callback); + callback = del_global_ref(env, callback); } callback = nullptr; } @@ -1269,7 +1260,7 @@ JNIEXPORT void JNICALL Java_ai_rapids_cudf_Table_writeCSVToFile( JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Table_startWriteCSVToBuffer( JNIEnv *env, jclass, jobjectArray j_column_names, jboolean include_header, jstring j_row_delimiter, jbyte j_field_delimiter, jstring j_null_value, jstring j_true_value, - jstring j_false_value, jint j_quote_style, jobject j_buffer) { + jstring j_false_value, jint j_quote_style, jobject j_buffer, jobject host_memory_allocator) { JNI_NULL_CHECK(env, j_column_names, "column name array cannot be null", 0); JNI_NULL_CHECK(env, j_row_delimiter, "row delimiter cannot be null", 0); JNI_NULL_CHECK(env, j_field_delimiter, "field delimiter cannot be null", 0); @@ -1279,7 +1270,8 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Table_startWriteCSVToBuffer( try { cudf::jni::auto_set_device(env); - auto data_sink = std::make_unique(env, j_buffer); + auto data_sink = + std::make_unique(env, j_buffer, host_memory_allocator); auto const n_column_names = cudf::jni::native_jstringArray{env, j_column_names}; auto const column_names = n_column_names.as_cpp_vector(); @@ -1576,7 +1568,7 @@ JNIEXPORT long JNICALL Java_ai_rapids_cudf_Table_writeParquetBufferBegin( jbooleanArray j_col_nullability, jobjectArray j_metadata_keys, jobjectArray j_metadata_values, jint j_compression, jint j_stats_freq, jbooleanArray j_isInt96, jintArray j_precisions, jbooleanArray j_is_map, jbooleanArray j_is_binary, jbooleanArray j_hasParquetFieldIds, - jintArray j_parquetFieldIds, jobject consumer) { + jintArray j_parquetFieldIds, jobject consumer, jobject host_memory_allocator) { JNI_NULL_CHECK(env, j_col_names, "null columns", 0); JNI_NULL_CHECK(env, j_col_nullability, "null nullability", 0); JNI_NULL_CHECK(env, j_metadata_keys, "null metadata keys", 0); @@ -1584,7 +1576,7 @@ JNIEXPORT long JNICALL Java_ai_rapids_cudf_Table_writeParquetBufferBegin( JNI_NULL_CHECK(env, consumer, "null consumer", 0); try { std::unique_ptr data_sink( - new cudf::jni::jni_writer_data_sink(env, consumer)); + new cudf::jni::jni_writer_data_sink(env, consumer, host_memory_allocator)); using namespace cudf::io; using namespace cudf::jni; @@ -1755,7 +1747,8 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_readORC( JNIEXPORT long JNICALL Java_ai_rapids_cudf_Table_writeORCBufferBegin( JNIEnv *env, jclass, jobjectArray j_col_names, jint j_num_children, jintArray j_children, jbooleanArray j_col_nullability, jobjectArray j_metadata_keys, jobjectArray j_metadata_values, - jint j_compression, jintArray j_precisions, jbooleanArray j_is_map, jobject consumer) { + jint j_compression, jintArray j_precisions, jbooleanArray j_is_map, jobject consumer, + jobject host_memory_allocator) { JNI_NULL_CHECK(env, 
j_col_names, "null columns", 0); JNI_NULL_CHECK(env, j_col_nullability, "null nullability", 0); JNI_NULL_CHECK(env, j_metadata_keys, "null metadata keys", 0); @@ -1787,7 +1780,7 @@ JNIEXPORT long JNICALL Java_ai_rapids_cudf_Table_writeORCBufferBegin( [](const std::string &k, const std::string &v) { return std::make_pair(k, v); }); std::unique_ptr data_sink( - new cudf::jni::jni_writer_data_sink(env, consumer)); + new cudf::jni::jni_writer_data_sink(env, consumer, host_memory_allocator)); sink_info sink{data_sink.get()}; auto stats = std::make_shared(); @@ -1918,9 +1911,9 @@ JNIEXPORT jdoubleArray JNICALL Java_ai_rapids_cudf_TableWriter_getWriteStatistic CATCH_STD(env, nullptr) } -JNIEXPORT long JNICALL Java_ai_rapids_cudf_Table_writeArrowIPCBufferBegin(JNIEnv *env, jclass, - jobjectArray j_col_names, - jobject consumer) { +JNIEXPORT long JNICALL Java_ai_rapids_cudf_Table_writeArrowIPCBufferBegin( + JNIEnv *env, jclass, jobjectArray j_col_names, jobject consumer, + jobject host_memory_allocator) { JNI_NULL_CHECK(env, j_col_names, "null columns", 0); JNI_NULL_CHECK(env, consumer, "null consumer", 0); try { @@ -1928,7 +1921,7 @@ JNIEXPORT long JNICALL Java_ai_rapids_cudf_Table_writeArrowIPCBufferBegin(JNIEnv cudf::jni::native_jstringArray col_names(env, j_col_names); std::shared_ptr data_sink( - new cudf::jni::jni_arrow_output_stream(env, consumer)); + new cudf::jni::jni_arrow_output_stream(env, consumer, host_memory_allocator)); cudf::jni::native_arrow_ipc_writer_handle *ret = new cudf::jni::native_arrow_ipc_writer_handle(col_names.as_cpp_vector(), data_sink); diff --git a/java/src/main/native/src/cudf_jni_apis.hpp b/java/src/main/native/src/cudf_jni_apis.hpp index 18993aea294..867df80b722 100644 --- a/java/src/main/native/src/cudf_jni_apis.hpp +++ b/java/src/main/native/src/cudf_jni_apis.hpp @@ -100,7 +100,8 @@ jobject contig_split_group_by_result_from(JNIEnv *env, jobjectArray &groups, /** * Allocate a HostMemoryBuffer */ -jobject allocate_host_buffer(JNIEnv *env, jlong amount, jboolean prefer_pinned); +jobject allocate_host_buffer(JNIEnv *env, jlong amount, jboolean prefer_pinned, + jobject host_memory_allocator); /** * Get the address of a HostMemoryBuffer diff --git a/java/src/main/native/src/jni_writer_data_sink.hpp b/java/src/main/native/src/jni_writer_data_sink.hpp index 05fe594fcd5..efac6112c25 100644 --- a/java/src/main/native/src/jni_writer_data_sink.hpp +++ b/java/src/main/native/src/jni_writer_data_sink.hpp @@ -26,7 +26,7 @@ constexpr long MINIMUM_WRITE_BUFFER_SIZE = 10 * 1024 * 1024; // 10 MB class jni_writer_data_sink final : public cudf::io::data_sink { public: - explicit jni_writer_data_sink(JNIEnv *env, jobject callback) { + explicit jni_writer_data_sink(JNIEnv *env, jobject callback, jobject host_memory_allocator) { if (env->GetJavaVM(&jvm) < 0) { throw std::runtime_error("GetJavaVM failed"); } @@ -42,10 +42,8 @@ class jni_writer_data_sink final : public cudf::io::data_sink { throw cudf::jni::jni_exception("handleBuffer method"); } - this->callback = env->NewGlobalRef(callback); - if (this->callback == nullptr) { - throw cudf::jni::jni_exception("global ref"); - } + this->callback = add_global_ref(env, callback); + this->host_memory_allocator = add_global_ref(env, host_memory_allocator); } virtual ~jni_writer_data_sink() { @@ -54,13 +52,13 @@ class jni_writer_data_sink final : public cudf::io::data_sink { // already be destroyed and this thread should not try to attach to get an environment. 
JNIEnv *env = nullptr; if (jvm->GetEnv(reinterpret_cast(&env), cudf::jni::MINIMUM_JNI_VERSION) == JNI_OK) { - env->DeleteGlobalRef(callback); - if (current_buffer != nullptr) { - env->DeleteGlobalRef(current_buffer); - } + callback = del_global_ref(env, callback); + current_buffer = del_global_ref(env, current_buffer); + host_memory_allocator = del_global_ref(env, host_memory_allocator); } callback = nullptr; current_buffer = nullptr; + host_memory_allocator = nullptr; } void host_write(void const *data, size_t size) override { @@ -126,10 +124,7 @@ class jni_writer_data_sink final : public cudf::io::data_sink { if (current_buffer_written > 0) { JNIEnv *env = cudf::jni::get_jni_env(jvm); handle_buffer(env, current_buffer, current_buffer_written); - if (current_buffer != nullptr) { - env->DeleteGlobalRef(current_buffer); - } - current_buffer = nullptr; + current_buffer = del_global_ref(env, current_buffer); current_buffer_len = 0; current_buffer_data = nullptr; current_buffer_written = 0; @@ -144,11 +139,10 @@ class jni_writer_data_sink final : public cudf::io::data_sink { void rotate_buffer(JNIEnv *env) { if (current_buffer != nullptr) { handle_buffer(env, current_buffer, current_buffer_written); - env->DeleteGlobalRef(current_buffer); - current_buffer = nullptr; } - jobject tmp_buffer = allocate_host_buffer(env, alloc_size, true); - current_buffer = env->NewGlobalRef(tmp_buffer); + current_buffer = del_global_ref(env, current_buffer); + jobject tmp_buffer = allocate_host_buffer(env, alloc_size, true, host_memory_allocator); + current_buffer = add_global_ref(env, tmp_buffer); current_buffer_len = get_host_buffer_length(env, current_buffer); current_buffer_data = reinterpret_cast(get_host_buffer_address(env, current_buffer)); current_buffer_written = 0; @@ -170,6 +164,7 @@ class jni_writer_data_sink final : public cudf::io::data_sink { long current_buffer_written = 0; size_t total_written = 0; long alloc_size = MINIMUM_WRITE_BUFFER_SIZE; + jobject host_memory_allocator; }; } // namespace cudf::jni From 04085acf1ed43921b638ead432d654695b84d4ea Mon Sep 17 00:00:00 2001 From: GALI PREM SAGAR Date: Wed, 30 Aug 2023 09:55:38 -0500 Subject: [PATCH 121/230] Fix `name` selection in `Index.difference` and `Index.intersection` (#13986) closes #13985 This PR fixes issues with `Index.difference` and `Index.intersection` API where the name selection was incorrect and `NA` values handling wasn't happening in these two APIs. Authors: - GALI PREM SAGAR (https://github.com/galipremsagar) Approvers: - Bradley Dice (https://github.com/bdice) - Matthew Roeschke (https://github.com/mroeschke) URL: https://github.com/rapidsai/cudf/pull/13986 --- python/cudf/cudf/core/_base_index.py | 21 +++++++++++---------- python/cudf/cudf/tests/test_index.py | 16 ++++++++++------ 2 files changed, 21 insertions(+), 16 deletions(-) diff --git a/python/cudf/cudf/core/_base_index.py b/python/cudf/cudf/core/_base_index.py index d593f0df138..829ca33d8a5 100644 --- a/python/cudf/cudf/core/_base_index.py +++ b/python/cudf/cudf/core/_base_index.py @@ -651,7 +651,7 @@ def _get_reconciled_name_object(self, other): case make a shallow copy of self. 
""" name = _get_result_name(self.name, other.name) - if self.name != name: + if not _is_same_name(self.name, name): return self.rename(name) return self @@ -943,17 +943,18 @@ def difference(self, other, sort=None): other = cudf.Index(other) + res_name = _get_result_name(self.name, other.name) + if is_mixed_with_object_dtype(self, other): difference = self.copy() else: other = other.copy(deep=False) - other.names = self.names difference = cudf.core.index._index_from_data( - cudf.DataFrame._from_data(self._data) + cudf.DataFrame._from_data({"None": self._column}) .merge( - cudf.DataFrame._from_data(other._data), + cudf.DataFrame._from_data({"None": other._column}), how="leftanti", - on=self.name, + on="None", ) ._data ) @@ -961,6 +962,8 @@ def difference(self, other, sort=None): if self.dtype != other.dtype: difference = difference.astype(self.dtype) + difference.name = res_name + if sort is None and len(other): return difference.sort_values() @@ -1323,14 +1326,12 @@ def _union(self, other, sort=None): return union_result def _intersection(self, other, sort=None): - other_unique = other.unique() - other_unique.names = self.names intersection_result = cudf.core.index._index_from_data( - cudf.DataFrame._from_data(self.unique()._data) + cudf.DataFrame._from_data({"None": self.unique()._column}) .merge( - cudf.DataFrame._from_data(other_unique._data), + cudf.DataFrame._from_data({"None": other.unique()._column}), how="inner", - on=self.name, + on="None", ) ._data ) diff --git a/python/cudf/cudf/tests/test_index.py b/python/cudf/cudf/tests/test_index.py index 2e6b45058ef..359b3c519de 100644 --- a/python/cudf/cudf/tests/test_index.py +++ b/python/cudf/cudf/tests/test_index.py @@ -804,12 +804,16 @@ def test_index_to_series(data): ], ) @pytest.mark.parametrize("sort", [None, False]) -def test_index_difference(data, other, sort): - pd_data = pd.Index(data) - pd_other = pd.Index(other) +@pytest.mark.parametrize( + "name_data,name_other", + [("abc", "c"), (None, "abc"), ("abc", pd.NA), ("abc", "abc")], +) +def test_index_difference(data, other, sort, name_data, name_other): + pd_data = pd.Index(data, name=name_data) + pd_other = pd.Index(other, name=name_other) - gd_data = cudf.core.index.as_index(data) - gd_other = cudf.core.index.as_index(other) + gd_data = cudf.from_pandas(pd_data) + gd_other = cudf.from_pandas(pd_other) expected = pd_data.difference(pd_other, sort=sort) actual = gd_data.difference(gd_other, sort=sort) @@ -2066,7 +2070,7 @@ def test_union_index(idx1, idx2, sort): (pd.RangeIndex(0, 10), pd.RangeIndex(3, 7)), (pd.RangeIndex(0, 10), pd.RangeIndex(-10, 20)), (pd.RangeIndex(0, 10, name="a"), pd.RangeIndex(90, 100, name="b")), - (pd.Index([0, 1, 2, 30], name="a"), pd.Index([30, 0, 90, 100])), + (pd.Index([0, 1, 2, 30], name=pd.NA), pd.Index([30, 0, 90, 100])), (pd.Index([0, 1, 2, 30], name="a"), [90, 100]), (pd.Index([0, 1, 2, 30]), pd.Index([0, 10, 1.0, 11])), (pd.Index(["a", "b", "c", "d", "c"]), pd.Index(["a", "c", "z"])), From 9259a20bc37fc323af7bc80e72a5af27400f3b09 Mon Sep 17 00:00:00 2001 From: Martin Marenz Date: Wed, 30 Aug 2023 17:01:33 +0200 Subject: [PATCH 122/230] Add `bytes_per_second` to copy_if_else benchmark (#13960) Adds `bytes_per_second` to the `COPY_IF_ELSE_BENCH` benchmark. This patch relates to #13735. 
Authors: - Martin Marenz (https://github.com/Blonck) Approvers: - Bradley Dice (https://github.com/bdice) - David Wendt (https://github.com/davidwendt) URL: https://github.com/rapidsai/cudf/pull/13960 --- cpp/benchmarks/copying/copy_if_else.cpp | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/cpp/benchmarks/copying/copy_if_else.cpp b/cpp/benchmarks/copying/copy_if_else.cpp index a10f54b3d6f..50ddfb82feb 100644 --- a/cpp/benchmarks/copying/copy_if_else.cpp +++ b/cpp/benchmarks/copying/copy_if_else.cpp @@ -47,6 +47,14 @@ static void BM_copy_if_else(benchmark::State& state, bool nulls) cuda_event_timer raii(state, true, cudf::get_default_stream()); cudf::copy_if_else(lhs, rhs, decision); } + + auto const bytes_read = n_rows * (sizeof(TypeParam) + sizeof(bool)); + auto const bytes_written = n_rows * sizeof(TypeParam); + auto const null_bytes = nulls ? 2 * cudf::bitmask_allocation_size_bytes(n_rows) : 0; + + // Use number of bytes read and written. + state.SetBytesProcessed(static_cast(state.iterations()) * + (bytes_read + bytes_written + null_bytes)); } #define COPY_BENCHMARK_DEFINE(name, type, b) \ From e63f64176ec362a00c7a9123f6244a44fdbe2ad2 Mon Sep 17 00:00:00 2001 From: Martin Marenz Date: Wed, 30 Aug 2023 17:21:09 +0200 Subject: [PATCH 123/230] Add `bytes_per_second` to hash_partition benchmark (#13965) Adds `bytes_per_second` to the `PARTITION_BENCH` benchmark. This patch relates to #13735. Authors: - Martin Marenz (https://github.com/Blonck) - Mark Harris (https://github.com/harrism) Approvers: - David Wendt (https://github.com/davidwendt) - Mark Harris (https://github.com/harrism) URL: https://github.com/rapidsai/cudf/pull/13965 --- cpp/benchmarks/hashing/partition.cpp | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/cpp/benchmarks/hashing/partition.cpp b/cpp/benchmarks/hashing/partition.cpp index b688fe2ed7f..0bec4394216 100644 --- a/cpp/benchmarks/hashing/partition.cpp +++ b/cpp/benchmarks/hashing/partition.cpp @@ -43,6 +43,13 @@ void BM_hash_partition(benchmark::State& state) cuda_event_timer timer(state, true); auto output = cudf::hash_partition(input, columns_to_hash, num_partitions); } + + auto const bytes_read = num_rows * num_cols * sizeof(T); + auto const bytes_written = num_rows * num_cols * sizeof(T); + auto const partition_bytes = num_partitions * sizeof(cudf::size_type); + + state.SetBytesProcessed(static_cast(state.iterations()) * + (bytes_read + bytes_written + partition_bytes)); } BENCHMARK_DEFINE_F(Hashing, hash_partition) From ed754da1b6711927622ef9544f52aa1c9ce22191 Mon Sep 17 00:00:00 2001 From: David Wendt <45795991+davidwendt@users.noreply.github.com> Date: Wed, 30 Aug 2023 11:22:14 -0400 Subject: [PATCH 124/230] Add tab as literal to cudf::test::to_string output (#13993) Adds escaped `\\t` to the `cudf::test::to_string()` output. Found this while working on #13891 where the output included tabs but was shown as a various number of spaces in the console when using `cudf::test::print()`. Also added `\\b` for good measure as well as a gtest for all the supported escape sequences. 
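The escaping itself is plain character-to-literal replacement; a small host-only sketch of the idea (the `escape_whitespace` helper is hypothetical, not a cudf API):

```cpp
#include <iostream>
#include <string>
#include <string_view>

// Hypothetical helper: replace special whitespace characters with their
// two-character literal forms so they are visible in printed test output.
std::string escape_whitespace(std::string_view in)
{
  std::string out(in);
  auto replace_char = [](std::string& s, char c, std::string_view repl) {
    auto pos = s.find(c);
    while (pos != std::string::npos) {
      s.replace(pos, 1, repl);
      pos = s.find(c, pos + repl.size());
    }
  };
  replace_char(out, '\t', "\\t");
  replace_char(out, '\n', "\\n");
  replace_char(out, '\r', "\\r");
  replace_char(out, '\b', "\\b");
  return out;
}

int main() { std::cout << escape_whitespace("e\te\ne") << '\n'; }  // prints e\te\ne
```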
Authors: - David Wendt (https://github.com/davidwendt) Approvers: - Bradley Dice (https://github.com/bdice) - Vukasin Milovanovic (https://github.com/vuule) URL: https://github.com/rapidsai/cudf/pull/13993 --- cpp/tests/utilities/column_utilities.cu | 7 ++++++- cpp/tests/utilities_tests/column_utilities_tests.cpp | 8 ++++++++ 2 files changed, 14 insertions(+), 1 deletion(-) diff --git a/cpp/tests/utilities/column_utilities.cu b/cpp/tests/utilities/column_utilities.cu index fcaf23fd456..bae402155e9 100644 --- a/cpp/tests/utilities/column_utilities.cu +++ b/cpp/tests/utilities/column_utilities.cu @@ -1091,7 +1091,7 @@ struct column_view_printer { if (col.is_empty()) return; auto h_data = cudf::test::to_host(col); - // explicitly replace '\r' and '\n' characters with "\r" and "\n" strings respectively. + // explicitly replace some special whitespace characters with their literal equivalents auto cleaned = [](std::string_view in) { std::string out(in); auto replace_char = [](std::string& out, char c, std::string_view repl) { @@ -1099,8 +1099,13 @@ struct column_view_printer { out.replace(pos, 1, repl); } }; + replace_char(out, '\a', "\\a"); + replace_char(out, '\b', "\\b"); + replace_char(out, '\f', "\\f"); replace_char(out, '\r', "\\r"); + replace_char(out, '\t', "\\t"); replace_char(out, '\n', "\\n"); + replace_char(out, '\v', "\\v"); return out; }; diff --git a/cpp/tests/utilities_tests/column_utilities_tests.cpp b/cpp/tests/utilities_tests/column_utilities_tests.cpp index e90a3f9ac6e..90a7270cb29 100644 --- a/cpp/tests/utilities_tests/column_utilities_tests.cpp +++ b/cpp/tests/utilities_tests/column_utilities_tests.cpp @@ -274,6 +274,14 @@ TEST_F(ColumnUtilitiesStringsTest, StringsToString) EXPECT_EQ(cudf::test::to_string(strings, delimiter), tmp.str()); } +TEST_F(ColumnUtilitiesStringsTest, PrintEscapeStrings) +{ + char const* delimiter = ","; + cudf::test::strings_column_wrapper input({"e\te\ne", "é\bé\ré", "e\vé\fé\abell"}); + std::string expected{"e\\te\\ne,é\\bé\\ré,e\\vé\\fé\\abell"}; + EXPECT_EQ(cudf::test::to_string(input, delimiter), expected); +} + TYPED_TEST(ColumnUtilitiesTestFixedPoint, NonNullableToHost) { using namespace numeric; From 2b5e0fb587cf0cb479b470af5aa6c67ce4e7f00f Mon Sep 17 00:00:00 2001 From: David Wendt <45795991+davidwendt@users.noreply.github.com> Date: Wed, 30 Aug 2023 12:51:28 -0400 Subject: [PATCH 125/230] Improve performance of nvtext::edit_distance (#13912) Improves performance of `nvtext::edit_distance` by reworking the algorithm with shorter working buffer and simpler logic. 
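The rework replaces the three-row, anti-diagonal formulation with the classic two-row Levenshtein recurrence, so each string pair only needs 2 * (min(len_a, len_b) + 1) integers of scratch space. A host-side sketch of the same recurrence (illustrative only, independent of the device code in the diff below):

```cpp
#include <algorithm>
#include <cstddef>
#include <numeric>
#include <string_view>
#include <utility>
#include <vector>

// Two-row Levenshtein distance: v0 holds the previous row, v1 the row
// being computed; only the shorter string's length determines the buffer.
int levenshtein(std::string_view a, std::string_view b)
{
  if (a.size() > b.size()) std::swap(a, b);  // keep `a` as the shorter string
  std::vector<int> v0(a.size() + 1);
  std::vector<int> v1(a.size() + 1);
  std::iota(v0.begin(), v0.end(), 0);  // distance of each prefix from the empty string
  for (std::size_t i = 0; i < b.size(); ++i) {
    v1[0] = static_cast<int>(i) + 1;
    for (std::size_t j = 0; j < a.size(); ++j) {
      int const sub = v0[j] + (a[j] != b[i]);  // substitution (free on a match)
      int const del = v0[j + 1] + 1;           // deletion
      int const ins = v1[j] + 1;               // insertion
      v1[j + 1] = std::min({sub, del, ins});
    }
    std::swap(v0, v1);
  }
  return v0[a.size()];  // after the final swap the answer is in v0
}
```

This is the same shape as the device-side `compute_distance` in the diff below, which is why the scratch-buffer sizing there changes from three `int16` rows per pair to `(min_length + 1) * 2` `size_type` entries.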
Authors: - David Wendt (https://github.com/davidwendt) Approvers: - Bradley Dice (https://github.com/bdice) - Karthikeyan (https://github.com/karthikeyann) - MithunR (https://github.com/mythrocks) URL: https://github.com/rapidsai/cudf/pull/13912 --- cpp/benchmarks/CMakeLists.txt | 4 +- cpp/benchmarks/text/edit_distance.cpp | 58 ++++++++ cpp/src/text/edit_distance.cu | 194 +++++++++++++------------- 3 files changed, 154 insertions(+), 102 deletions(-) create mode 100644 cpp/benchmarks/text/edit_distance.cpp diff --git a/cpp/benchmarks/CMakeLists.txt b/cpp/benchmarks/CMakeLists.txt index 96e24efac8a..5e7862f4b3b 100644 --- a/cpp/benchmarks/CMakeLists.txt +++ b/cpp/benchmarks/CMakeLists.txt @@ -276,8 +276,8 @@ ConfigureBench(BINARYOP_BENCH binaryop/binaryop.cpp binaryop/compiled_binaryop.c ConfigureBench(TEXT_BENCH text/ngrams.cpp text/subword.cpp) ConfigureNVBench( - TEXT_NVBENCH text/hash_ngrams.cpp text/jaccard.cpp text/minhash.cpp text/normalize.cpp - text/replace.cpp text/tokenize.cpp + TEXT_NVBENCH text/edit_distance.cpp text/hash_ngrams.cpp text/jaccard.cpp text/minhash.cpp + text/normalize.cpp text/replace.cpp text/tokenize.cpp ) # ################################################################################################## diff --git a/cpp/benchmarks/text/edit_distance.cpp b/cpp/benchmarks/text/edit_distance.cpp new file mode 100644 index 00000000000..8a8bd9ae586 --- /dev/null +++ b/cpp/benchmarks/text/edit_distance.cpp @@ -0,0 +1,58 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include + +#include + +#include + +#include + +#include + +static void bench_edit_distance(nvbench::state& state) +{ + auto const num_rows = static_cast(state.get_int64("num_rows")); + auto const row_width = static_cast(state.get_int64("row_width")); + + if (static_cast(num_rows) * static_cast(row_width) >= + static_cast(std::numeric_limits::max())) { + state.skip("Skip benchmarks greater than size_type limit"); + } + + data_profile const strings_profile = data_profile_builder().distribution( + cudf::type_id::STRING, distribution_id::NORMAL, 0, row_width); + auto const strings_table = create_random_table( + {cudf::type_id::STRING, cudf::type_id::STRING}, row_count{num_rows}, strings_profile); + cudf::strings_column_view input1(strings_table->view().column(0)); + cudf::strings_column_view input2(strings_table->view().column(1)); + + state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::get_default_stream().value())); + + auto chars_size = input1.chars_size() + input2.chars_size(); + state.add_global_memory_reads(chars_size); + // output are integers (one per row) + state.add_global_memory_writes(num_rows); + + state.exec(nvbench::exec_tag::sync, + [&](nvbench::launch& launch) { auto result = nvtext::edit_distance(input1, input2); }); +} + +NVBENCH_BENCH(bench_edit_distance) + .set_name("edit_distance") + .add_int64_axis("num_rows", {1024, 4096, 8192, 16364, 32768, 262144}) + .add_int64_axis("row_width", {8, 16, 32, 64, 128, 256}); diff --git a/cpp/src/text/edit_distance.cu b/cpp/src/text/edit_distance.cu index fb0ecdb7677..1460be4fcf5 100644 --- a/cpp/src/text/edit_distance.cu +++ b/cpp/src/text/edit_distance.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2022, NVIDIA CORPORATION. + * Copyright (c) 2020-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -29,11 +29,13 @@ #include #include +#include #include #include #include #include #include +#include #include #include @@ -42,91 +44,76 @@ namespace detail { namespace { /** - * @brief Compute the edit-distance between two strings + * @brief Compute the Levenshtein distance for each string pair * - * The temporary buffer must be able to hold 3 int16 values for each character - * in the smaller of the two provided strings. + * Documentation here: https://www.cuelogic.com/blog/the-levenshtein-algorithm + * And here: https://en.wikipedia.org/wiki/Levenshtein_distance * * @param d_str First string * @param d_tgt Second string - * @param buffer Temporary memory buffer used for the calculation. - * @return Edit distance value + * @param buffer Working buffer for intermediate calculations + * @return The edit distance value */ -__device__ int32_t compute_distance(cudf::string_view const& d_str, - cudf::string_view const& d_tgt, - int16_t* buffer) +__device__ cudf::size_type compute_distance(cudf::string_view const& d_str, + cudf::string_view const& d_tgt, + cudf::size_type* buffer) { auto const str_length = d_str.length(); auto const tgt_length = d_tgt.length(); if (str_length == 0) return tgt_length; if (tgt_length == 0) return str_length; - auto itr_A = str_length < tgt_length ? d_str.begin() : d_tgt.begin(); - auto itr_B = str_length < tgt_length ? d_tgt.begin() : d_str.begin(); + auto begin = str_length < tgt_length ? d_str.begin() : d_tgt.begin(); + auto itr = str_length < tgt_length ? 
d_tgt.begin() : d_str.begin(); // .first is min and .second is max - auto const lengths = std::minmax(str_length, tgt_length); + auto const [n, m] = std::minmax(str_length, tgt_length); // setup compute buffer pointers - auto line2 = buffer; - auto line1 = line2 + lengths.first; - auto line0 = line1 + lengths.first; - // range is both lengths - auto const range = lengths.first + lengths.second - 1; - for (cudf::size_type i = 0; i < range; ++i) { - auto tmp = line2; - line2 = line1; - line1 = line0; - line0 = tmp; - // checking pairs of characters - for (int x = (i < lengths.second ? 0 : i - lengths.second + 1); - (x < lengths.first) && (x < i + 1); - ++x) { - int const y = i - x; - itr_A += (x - itr_A.position()); // point to next - itr_B += (y - itr_B.position()); // characters to check - int16_t const w = - (((x > 0) && (y > 0)) ? line2[x - 1] : static_cast(std::max(x, y))) + - static_cast(*itr_A != *itr_B); // add 1 if characters do not match - int16_t const u = (y > 0 ? line1[x] : x + 1) + 1; - int16_t const v = (x > 0 ? line1[x - 1] : y + 1) + 1; - // store min(u,v,w) - line0[x] = std::min(std::min(u, v), w); + auto v0 = buffer; + auto v1 = v0 + n + 1; + // initialize v0 + thrust::sequence(thrust::seq, v0, v1); + + for (int i = 0; i < m; ++i, ++itr) { + auto itr_tgt = begin; + v1[0] = i + 1; + for (int j = 0; j < n; ++j, ++itr_tgt) { + auto sub_cost = v0[j] + (*itr != *itr_tgt); + auto del_cost = v0[j + 1] + 1; + auto ins_cost = v1[j] + 1; + v1[j + 1] = std::min(std::min(sub_cost, del_cost), ins_cost); } + thrust::swap(v0, v1); } - return static_cast(line0[lengths.first - 1]); + return v0[n]; } -/** - * @brief Compute the Levenshtein distance for each string. - * - * Documentation here: https://www.cuelogic.com/blog/the-levenshtein-algorithm - * And here: https://en.wikipedia.org/wiki/Levenshtein_distances - */ struct edit_distance_levenshtein_algorithm { cudf::column_device_view d_strings; // computing these cudf::column_device_view d_targets; // against these; - int16_t* d_buffer; // compute buffer for each string - int32_t* d_results; // input is buffer offset; output is edit distance + cudf::size_type* d_buffer; // compute buffer for each string + std::ptrdiff_t const* d_offsets; // locate sub-buffer for each string + cudf::size_type* d_results; // edit distance values - __device__ void operator()(cudf::size_type idx) + __device__ void operator()(cudf::size_type idx) const { auto d_str = d_strings.is_null(idx) ? cudf::string_view{} : d_strings.element(idx); auto d_tgt = [&] __device__ { // d_targets is also allowed to have only one entry - if (d_targets.is_null(idx)) return cudf::string_view{}; + if (d_targets.is_null(idx)) { return cudf::string_view{}; } return d_targets.size() == 1 ? 
d_targets.element(0) : d_targets.element(idx); }(); - d_results[idx] = compute_distance(d_str, d_tgt, d_buffer + d_results[idx]); + d_results[idx] = compute_distance(d_str, d_tgt, d_buffer + d_offsets[idx]); } }; struct edit_distance_matrix_levenshtein_algorithm { cudf::column_device_view d_strings; // computing these against itself - int16_t* d_buffer; // compute buffer for each string - int32_t const* d_offsets; // locate sub-buffer for each string - int32_t* d_results; // edit distance values + cudf::size_type* d_buffer; // compute buffer for each string + std::ptrdiff_t const* d_offsets; // locate sub-buffer for each string + cudf::size_type* d_results; // edit distance values - __device__ void operator()(cudf::size_type idx) + __device__ void operator()(cudf::size_type idx) const { auto const strings_count = d_strings.size(); auto const row = idx / strings_count; @@ -136,9 +123,9 @@ struct edit_distance_matrix_levenshtein_algorithm { d_strings.is_null(row) ? cudf::string_view{} : d_strings.element(row); cudf::string_view d_str2 = d_strings.is_null(col) ? cudf::string_view{} : d_strings.element(col); - auto work_buffer = d_buffer + d_offsets[idx - ((row + 1) * (row + 2)) / 2]; - int32_t const distance = (row == col) ? 0 : compute_distance(d_str1, d_str2, work_buffer); - d_results[idx] = distance; // top half of matrix + auto work_buffer = d_buffer + d_offsets[idx - ((row + 1) * (row + 2)) / 2]; + auto const distance = (row == col) ? 0 : compute_distance(d_str1, d_str2, work_buffer); + d_results[idx] = distance; // top half of matrix d_results[col * strings_count + row] = distance; // bottom half of matrix } }; @@ -153,10 +140,13 @@ std::unique_ptr edit_distance(cudf::strings_column_view const& str rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { - cudf::size_type strings_count = strings.size(); - if (strings_count == 0) return cudf::make_empty_column(cudf::data_type{cudf::type_id::INT32}); - if (targets.size() > 1) + auto const strings_count = strings.size(); + if (strings_count == 0) { + return cudf::make_empty_column(cudf::data_type{cudf::type_to_id()}); + } + if (targets.size() > 1) { CUDF_EXPECTS(strings_count == targets.size(), "targets.size() must equal strings.size()"); + } // create device columns from the input columns auto strings_column = cudf::column_device_view::create(strings.parent(), stream); @@ -165,46 +155,46 @@ std::unique_ptr edit_distance(cudf::strings_column_view const& str auto d_targets = *targets_column; // calculate the size of the compute-buffer; - // we can use the output column buffer to hold the size/offset values temporarily - auto results = cudf::make_fixed_width_column(cudf::data_type{cudf::type_id::INT32}, - strings_count, - rmm::device_buffer{0, stream, mr}, - 0, - stream, - mr); - auto d_results = results->mutable_view().data(); - + rmm::device_uvector offsets(strings_count, stream); thrust::transform(rmm::exec_policy(stream), thrust::make_counting_iterator(0), thrust::make_counting_iterator(strings_count), - d_results, + offsets.begin(), [d_strings, d_targets] __device__(auto idx) { - if (d_strings.is_null(idx) || d_targets.is_null(idx)) return int32_t{0}; + if (d_strings.is_null(idx) || d_targets.is_null(idx)) { + return cudf::size_type{0}; + } auto d_str = d_strings.element(idx); auto d_tgt = d_targets.size() == 1 ? 
d_targets.element(0) : d_targets.element(idx); - // just need 3 int16's for each character of the shorter string - return static_cast(std::min(d_str.length(), d_tgt.length()) * 3); + // just need 2 integers for each character of the shorter string + return (std::min(d_str.length(), d_tgt.length()) + 1) * 2; }); // get the total size of the temporary compute buffer - size_t compute_size = - thrust::reduce(rmm::exec_policy(stream), d_results, d_results + strings_count, size_t{0}); + int64_t compute_size = + thrust::reduce(rmm::exec_policy(stream), offsets.begin(), offsets.end(), int64_t{0}); // convert sizes to offsets in-place - thrust::exclusive_scan(rmm::exec_policy(stream), d_results, d_results + strings_count, d_results); + thrust::exclusive_scan(rmm::exec_policy(stream), offsets.begin(), offsets.end(), offsets.begin()); // create the temporary compute buffer - rmm::device_uvector compute_buffer(compute_size, stream); + rmm::device_uvector compute_buffer(compute_size, stream); auto d_buffer = compute_buffer.data(); - // compute the edit distance into the output column in-place - // - on input, d_results is the offset to the working section of d_buffer for each row - // - on output, d_results is the calculated edit distance for that row + auto results = cudf::make_fixed_width_column(cudf::data_type{cudf::type_to_id()}, + strings_count, + rmm::device_buffer{0, stream, mr}, + 0, + stream, + mr); + auto d_results = results->mutable_view().data(); + + // compute the edit distance into the output column thrust::for_each_n( rmm::exec_policy(stream), thrust::make_counting_iterator(0), strings_count, - edit_distance_levenshtein_algorithm{d_strings, d_targets, d_buffer, d_results}); + edit_distance_levenshtein_algorithm{d_strings, d_targets, d_buffer, offsets.data(), d_results}); return results; } @@ -216,7 +206,9 @@ std::unique_ptr edit_distance_matrix(cudf::strings_column_view con rmm::mr::device_memory_resource* mr) { cudf::size_type strings_count = strings.size(); - if (strings_count == 0) return cudf::make_empty_column(cudf::data_type{cudf::type_id::INT32}); + if (strings_count == 0) { + return cudf::make_empty_column(cudf::data_type{cudf::type_to_id()}); + } CUDF_EXPECTS(strings_count > 1, "the input strings must include at least 2 strings"); CUDF_EXPECTS(static_cast(strings_count) * static_cast(strings_count) < static_cast(std::numeric_limits().max()), @@ -230,7 +222,7 @@ std::unique_ptr edit_distance_matrix(cudf::strings_column_view con // We only need memory for half the size of the output matrix since the edit distance calculation // is commutative -- `distance(strings[i],strings[j]) == distance(strings[j],strings[i])` cudf::size_type n_upper = (strings_count * (strings_count - 1)) / 2; - rmm::device_uvector offsets(n_upper, stream); + rmm::device_uvector offsets(n_upper, stream); auto d_offsets = offsets.data(); CUDF_CUDA_TRY(cudaMemsetAsync(d_offsets, 0, n_upper * sizeof(cudf::size_type), stream.value())); thrust::for_each_n( @@ -245,28 +237,29 @@ std::unique_ptr edit_distance_matrix(cudf::strings_column_view con d_strings.is_null(row) ? cudf::string_view{} : d_strings.element(row); cudf::string_view const d_str2 = d_strings.is_null(col) ? 
cudf::string_view{} : d_strings.element(col); - if (d_str1.empty() || d_str2.empty()) return; - // the temp size needed is 3 int16s per character of the shorter string - d_offsets[idx - ((row + 1) * (row + 2)) / 2] = std::min(d_str1.length(), d_str2.length()) * 3; + if (d_str1.empty() || d_str2.empty()) { return; } + // the temp size needed is 2 integers per character of the shorter string + d_offsets[idx - ((row + 1) * (row + 2)) / 2] = + (std::min(d_str1.length(), d_str2.length()) + 1) * 2; }); // get the total size for the compute buffer - size_t compute_size = - thrust::reduce(rmm::exec_policy(stream), offsets.begin(), offsets.end(), size_t{0}); + int64_t compute_size = + thrust::reduce(rmm::exec_policy(stream), offsets.begin(), offsets.end(), int64_t{0}); // convert sizes to offsets in-place thrust::exclusive_scan(rmm::exec_policy(stream), offsets.begin(), offsets.end(), offsets.begin()); // create the compute buffer - rmm::device_uvector compute_buffer(compute_size, stream); + rmm::device_uvector compute_buffer(compute_size, stream); auto d_buffer = compute_buffer.data(); // compute the edit distance into the output column - auto results = cudf::make_fixed_width_column(cudf::data_type{cudf::type_id::INT32}, + auto results = cudf::make_fixed_width_column(cudf::data_type{cudf::type_to_id()}, strings_count * strings_count, rmm::device_buffer{0, stream, mr}, 0, stream, mr); - auto d_results = results->mutable_view().data(); + auto d_results = results->mutable_view().data(); thrust::for_each_n( rmm::exec_policy(stream), thrust::make_counting_iterator(0), @@ -274,20 +267,21 @@ std::unique_ptr edit_distance_matrix(cudf::strings_column_view con edit_distance_matrix_levenshtein_algorithm{d_strings, d_buffer, d_offsets, d_results}); // build a lists column of the results - auto offsets_column = cudf::make_fixed_width_column(cudf::data_type{cudf::type_id::INT32}, - strings_count + 1, - rmm::device_buffer{0, stream, mr}, - 0, - stream, - mr); + auto offsets_column = + cudf::make_fixed_width_column(cudf::data_type{cudf::type_to_id()}, + strings_count + 1, + rmm::device_buffer{0, stream, mr}, + 0, + stream, + mr); thrust::transform_exclusive_scan( rmm::exec_policy(stream), - thrust::make_counting_iterator(0), - thrust::make_counting_iterator(strings_count + 1), - offsets_column->mutable_view().data(), + thrust::counting_iterator(0), + thrust::counting_iterator(strings_count + 1), + offsets_column->mutable_view().data(), [strings_count] __device__(auto idx) { return strings_count; }, - int32_t{0}, - thrust::plus()); + cudf::size_type{0}, + thrust::plus()); return cudf::make_lists_column(strings_count, std::move(offsets_column), std::move(results), From f999e1c5ed183253585606fdfc7552a224aee2d7 Mon Sep 17 00:00:00 2001 From: GALI PREM SAGAR Date: Wed, 30 Aug 2023 12:34:00 -0500 Subject: [PATCH 126/230] Fix an issue where casting null-array to `object` dtype will result in a failure (#13994) closes #13992 This PR fixes the construction of an empty `MultiIndex` from `pandas` to `cudf` was causing an error in pandas-compatibility mode, null-array is one such case where it is _okay_ to cast to any type because there is no data in it. Hence we pass `str` dtype to `astype`, whenever we encounter an `object` dtype. 
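For reference, a minimal sketch of the previously failing case, mirroring the new test added in this change (the empty `MultiIndex` slice is just one way to end up with a null array of `object` dtype):

```python
import pandas as pd
import cudf

# An empty slice of a pandas MultiIndex is backed by null arrays with object dtype.
pidx = pd.MultiIndex.from_tuples([("a", "b")])[:0]

with cudf.option_context("mode.pandas_compatible", True):
    # Previously raised; the null arrays are now cast via the "str" dtype instead.
    gidx = cudf.from_pandas(pidx)
```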
Authors: - GALI PREM SAGAR (https://github.com/galipremsagar) Approvers: - Bradley Dice (https://github.com/bdice) URL: https://github.com/rapidsai/cudf/pull/13994 --- python/cudf/cudf/core/column/column.py | 17 +++++++++++++---- python/cudf/cudf/tests/test_multiindex.py | 7 +++++++ 2 files changed, 20 insertions(+), 4 deletions(-) diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index b5332f35073..d60f426c642 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -1996,18 +1996,27 @@ def as_column( col = ColumnBase.from_arrow(arbitrary) if isinstance(arbitrary, pa.NullArray): - new_dtype = cudf.dtype(arbitrary.type.to_pandas_dtype()) if dtype is not None: # Cast the column to the `dtype` if specified. - col = col.astype(dtype) + new_dtype = dtype elif len(arbitrary) == 0: # If the column is empty, it has to be # a `float64` dtype. - col = col.astype("float64") + new_dtype = cudf.dtype("float64") else: # If the null column is not empty, it has to # be of `object` dtype. - col = col.astype(new_dtype) + new_dtype = cudf.dtype(arbitrary.type.to_pandas_dtype()) + + if cudf.get_option( + "mode.pandas_compatible" + ) and new_dtype == cudf.dtype("O"): + # We internally raise if we do `astype("object")`, hence + # need to cast to `str` since this is safe to do so because + # it is a null-array. + new_dtype = "str" + + col = col.astype(new_dtype) return col diff --git a/python/cudf/cudf/tests/test_multiindex.py b/python/cudf/cudf/tests/test_multiindex.py index 464b9623bad..bc9cf20b711 100644 --- a/python/cudf/cudf/tests/test_multiindex.py +++ b/python/cudf/cudf/tests/test_multiindex.py @@ -1889,3 +1889,10 @@ def test_multiindex_levels(): assert_eq(gidx.levels[0], pidx.levels[0]) assert_eq(gidx.levels[1], pidx.levels[1]) + + +def test_multiindex_empty_slice_pandas_compatibility(): + expected = pd.MultiIndex.from_tuples([("a", "b")])[:0] + with cudf.option_context("mode.pandas_compatible", True): + actual = cudf.from_pandas(expected) + assert_eq(expected, actual, exact=False) From 8978a2163baa4d70effd9decf4cdd689705b42b5 Mon Sep 17 00:00:00 2001 From: Ed Seidl Date: Wed, 30 Aug 2023 13:59:46 -0700 Subject: [PATCH 127/230] Create table_input_metadata from a table_metadata (#13920) When round-tripping data through cuDF (e.g. read a parquet file with `read_parquet()`, then write slices using the `chunked_parquet_writer`) column nullability information can be lost. The parquet writers will accept a `table_input_metadata` object as an optional parameter, and this object can be used to preserve the nullability. Creating the `table_input_metadata` can be a challenge, however. This PR addresses this problem by adding the ability to create a `table_input_metadata` using the `table_metadata` returned by `read_parquet()`. 
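A rough usage sketch of the new constructor (file names and variable names are placeholders; the flow mirrors the `PreserveNullability` test added below):

```cpp
#include <cudf/io/parquet.hpp>

// Read a parquet file; per-column nullability is now recorded in result.metadata.
cudf::io::parquet_reader_options const in_opts =
  cudf::io::parquet_reader_options::builder(cudf::io::source_info{"input.parquet"});
auto const result = cudf::io::read_parquet(in_opts);

// New in this change: build writer input metadata directly from the read metadata.
auto const metadata = cudf::io::table_input_metadata{result.metadata};

// Write the table (or slices of it) back out without losing nullability information.
cudf::io::parquet_writer_options out_opts =
  cudf::io::parquet_writer_options::builder(cudf::io::sink_info{"output.parquet"},
                                            result.tbl->view())
    .metadata(metadata);
cudf::io::write_parquet(out_opts);
```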
Authors: - Ed Seidl (https://github.com/etseidl) - Mike Wilson (https://github.com/hyperbolic2346) - Vukasin Milovanovic (https://github.com/vuule) Approvers: - Vukasin Milovanovic (https://github.com/vuule) - Mike Wilson (https://github.com/hyperbolic2346) URL: https://github.com/rapidsai/cudf/pull/13920 --- cpp/include/cudf/io/types.hpp | 25 ++++++++-- cpp/src/io/functions.cpp | 20 ++++++++ cpp/src/io/parquet/reader_impl.cpp | 5 +- cpp/src/io/utilities/column_buffer.cpp | 5 +- cpp/tests/io/parquet_test.cpp | 64 ++++++++++++++++++++++++++ 5 files changed, 112 insertions(+), 7 deletions(-) diff --git a/cpp/include/cudf/io/types.hpp b/cpp/include/cudf/io/types.hpp index 9b0dcff99af..a97f81182ac 100644 --- a/cpp/include/cudf/io/types.hpp +++ b/cpp/include/cudf/io/types.hpp @@ -201,20 +201,27 @@ enum dictionary_policy { }; /** - * @brief Detailed name information for output columns. + * @brief Detailed name (and optionally nullability) information for output columns. * * The hierarchy of children matches the hierarchy of children in the output * cudf columns. */ struct column_name_info { std::string name; ///< Column name + std::optional is_nullable; ///< Column nullability std::vector children; ///< Child column names + /** - * @brief Construct a column name info with a name and no children + * @brief Construct a column name info with a name, optional nullabilty, and no children * * @param _name Column name + * @param _is_nullable True if column is nullable */ - column_name_info(std::string const& _name) : name(_name) {} + column_name_info(std::string const& _name, std::optional _is_nullable = std::nullopt) + : name(_name), is_nullable(_is_nullable) + { + } + column_name_info() = default; }; @@ -798,7 +805,17 @@ class table_input_metadata { * * @param table The table_view to construct metadata for */ - table_input_metadata(table_view const& table); + explicit table_input_metadata(table_view const& table); + + /** + * @brief Construct a new table_input_metadata from a table_metadata object. + * + * The constructed table_input_metadata has the same structure, column names and nullability as + * the passed table_metadata. 
+ * + * @param metadata The table_metadata to construct table_intput_metadata for + */ + explicit table_input_metadata(table_metadata const& metadata); std::vector column_metadata; //!< List of column metadata }; diff --git a/cpp/src/io/functions.cpp b/cpp/src/io/functions.cpp index 5adb2046dbd..45f8b0f8822 100644 --- a/cpp/src/io/functions.cpp +++ b/cpp/src/io/functions.cpp @@ -517,6 +517,26 @@ table_input_metadata::table_input_metadata(table_view const& table) table.begin(), table.end(), std::back_inserter(this->column_metadata), get_children); } +table_input_metadata::table_input_metadata(table_metadata const& metadata) +{ + auto const& names = metadata.schema_info; + + // Create a metadata hierarchy with naming and nullability using `table_metadata` + std::function process_node = + [&](column_name_info const& name) { + auto col_meta = column_in_metadata{name.name}; + if (name.is_nullable.has_value()) { col_meta.set_nullability(name.is_nullable.value()); } + std::transform(name.children.begin(), + name.children.end(), + std::back_inserter(col_meta.children), + process_node); + return col_meta; + }; + + std::transform( + names.begin(), names.end(), std::back_inserter(this->column_metadata), process_node); +} + /** * @copydoc cudf::io::write_parquet */ diff --git a/cpp/src/io/parquet/reader_impl.cpp b/cpp/src/io/parquet/reader_impl.cpp index 9289ddb91b3..8a73c43be3e 100644 --- a/cpp/src/io/parquet/reader_impl.cpp +++ b/cpp/src/io/parquet/reader_impl.cpp @@ -366,8 +366,9 @@ void reader::impl::populate_metadata(table_metadata& out_metadata) // Return column names out_metadata.schema_info.resize(_output_buffers.size()); for (size_t i = 0; i < _output_column_schemas.size(); i++) { - auto const& schema = _metadata->get_schema(_output_column_schemas[i]); - out_metadata.schema_info[i].name = schema.name; + auto const& schema = _metadata->get_schema(_output_column_schemas[i]); + out_metadata.schema_info[i].name = schema.name; + out_metadata.schema_info[i].is_nullable = schema.repetition_type != REQUIRED; } // Return user metadata diff --git a/cpp/src/io/utilities/column_buffer.cpp b/cpp/src/io/utilities/column_buffer.cpp index 3248d94d60a..f3a43cbc63c 100644 --- a/cpp/src/io/utilities/column_buffer.cpp +++ b/cpp/src/io/utilities/column_buffer.cpp @@ -149,7 +149,10 @@ std::unique_ptr make_column(column_buffer_base& buffer, std::optional const& schema, rmm::cuda_stream_view stream) { - if (schema_info != nullptr) { schema_info->name = buffer.name; } + if (schema_info != nullptr) { + schema_info->name = buffer.name; + schema_info->is_nullable = buffer.is_nullable; + } switch (buffer.type.id()) { case type_id::STRING: diff --git a/cpp/tests/io/parquet_test.cpp b/cpp/tests/io/parquet_test.cpp index b210452f619..3cd5c9f5593 100644 --- a/cpp/tests/io/parquet_test.cpp +++ b/cpp/tests/io/parquet_test.cpp @@ -6599,6 +6599,70 @@ TEST_F(ParquetWriterTest, TimestampMicrosINT96NoOverflow) CUDF_TEST_EXPECT_TABLES_EQUAL(expected, result.tbl->view()); } +TEST_F(ParquetWriterTest, PreserveNullability) +{ + constexpr auto num_rows = 100; + + auto const col0_data = random_values(num_rows); + auto const col1_data = random_values(num_rows); + + auto const col0_validity = cudf::test::iterators::no_nulls(); + auto const col1_validity = + cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i % 2 == 0; }); + + column_wrapper col0{col0_data.begin(), col0_data.end(), col0_validity}; + column_wrapper col1{col1_data.begin(), col1_data.end(), col1_validity}; + auto const col2 = make_parquet_list_list_col(0, 
num_rows, 5, 8, true); + + auto const expected = table_view{{col0, col1, *col2}}; + + cudf::io::table_input_metadata expected_metadata(expected); + expected_metadata.column_metadata[0].set_name("mandatory"); + expected_metadata.column_metadata[0].set_nullability(false); + expected_metadata.column_metadata[1].set_name("optional"); + expected_metadata.column_metadata[1].set_nullability(true); + expected_metadata.column_metadata[2].set_name("lists"); + expected_metadata.column_metadata[2].set_nullability(true); + // offsets is a cudf thing that's not part of the parquet schema so it won't have nullability set + expected_metadata.column_metadata[2].child(0).set_name("offsets"); + expected_metadata.column_metadata[2].child(1).set_name("element"); + expected_metadata.column_metadata[2].child(1).set_nullability(false); + expected_metadata.column_metadata[2].child(1).child(0).set_name("offsets"); + expected_metadata.column_metadata[2].child(1).child(1).set_name("element"); + expected_metadata.column_metadata[2].child(1).child(1).set_nullability(true); + + auto const filepath = temp_env->get_temp_filepath("PreserveNullability.parquet"); + cudf::io::parquet_writer_options out_opts = + cudf::io::parquet_writer_options::builder(cudf::io::sink_info{filepath}, expected) + .metadata(expected_metadata); + + cudf::io::write_parquet(out_opts); + + cudf::io::parquet_reader_options const in_opts = + cudf::io::parquet_reader_options::builder(cudf::io::source_info{filepath}); + auto const result = cudf::io::read_parquet(in_opts); + auto const read_metadata = cudf::io::table_input_metadata{result.metadata}; + + // test that expected_metadata matches read_metadata + std::function + compare_names_and_nullability = [&](auto lhs, auto rhs) { + EXPECT_EQ(lhs.get_name(), rhs.get_name()); + ASSERT_EQ(lhs.is_nullability_defined(), rhs.is_nullability_defined()); + if (lhs.is_nullability_defined()) { EXPECT_EQ(lhs.nullable(), rhs.nullable()); } + ASSERT_EQ(lhs.num_children(), rhs.num_children()); + for (int i = 0; i < lhs.num_children(); ++i) { + compare_names_and_nullability(lhs.child(i), rhs.child(i)); + } + }; + + ASSERT_EQ(expected_metadata.column_metadata.size(), read_metadata.column_metadata.size()); + + for (size_t i = 0; i < expected_metadata.column_metadata.size(); ++i) { + compare_names_and_nullability(expected_metadata.column_metadata[i], + read_metadata.column_metadata[i]); + } +} + TEST_P(ParquetV2Test, CheckEncodings) { using cudf::io::parquet::Encoding; From 04ee729f583ffd44f73483f25a080d880c959f41 Mon Sep 17 00:00:00 2001 From: GALI PREM SAGAR Date: Wed, 30 Aug 2023 16:05:35 -0500 Subject: [PATCH 128/230] Fix return type of `MultiIndex.difference` (#14009) closes #14008 This PR ensures `MultiIndex.difference` returns `cudf.MultiIndex`. 
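For example (the data here is made up; the assertion matches the updated test):

```python
import cudf

midx1 = cudf.MultiIndex.from_tuples([("a", 1), ("b", 2)])
midx2 = cudf.MultiIndex.from_tuples([("a", 1), ("c", 3)])

result = midx2.difference(midx1)
# The difference used to come back as a pandas.MultiIndex; it is now a cudf.MultiIndex.
assert isinstance(result, cudf.MultiIndex)
```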
Authors: - GALI PREM SAGAR (https://github.com/galipremsagar) Approvers: - Matthew Roeschke (https://github.com/mroeschke) - Bradley Dice (https://github.com/bdice) URL: https://github.com/rapidsai/cudf/pull/14009 --- python/cudf/cudf/core/multiindex.py | 2 +- python/cudf/cudf/tests/test_multiindex.py | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/python/cudf/cudf/core/multiindex.py b/python/cudf/cudf/core/multiindex.py index eb953a54f6b..12da69740d8 100644 --- a/python/cudf/cudf/core/multiindex.py +++ b/python/cudf/cudf/core/multiindex.py @@ -1637,7 +1637,7 @@ def memory_usage(self, deep=False): def difference(self, other, sort=None): if hasattr(other, "to_pandas"): other = other.to_pandas() - return self.to_pandas().difference(other, sort) + return cudf.from_pandas(self.to_pandas().difference(other, sort)) @_cudf_nvtx_annotate def append(self, other): diff --git a/python/cudf/cudf/tests/test_multiindex.py b/python/cudf/cudf/tests/test_multiindex.py index bc9cf20b711..eedc9b0c174 100644 --- a/python/cudf/cudf/tests/test_multiindex.py +++ b/python/cudf/cudf/tests/test_multiindex.py @@ -1697,6 +1697,7 @@ def test_difference(): expected = midx2.to_pandas().difference(midx.to_pandas()) actual = midx2.difference(midx) + assert isinstance(actual, cudf.MultiIndex) assert_eq(expected, actual) From c1b79313f0aa6d1bcbef73a3a1a3471512ecfce8 Mon Sep 17 00:00:00 2001 From: GALI PREM SAGAR Date: Wed, 30 Aug 2023 17:28:40 -0500 Subject: [PATCH 129/230] Raise an error when timezone subtypes are encountered in `pd.IntervalDtype` (#14006) closes #14004 This PR raises an error when an `IntervalIndex` contains a timezone-aware sub-type so that we don't go into infinite recursion. Authors: - GALI PREM SAGAR (https://github.com/galipremsagar) Approvers: - Matthew Roeschke (https://github.com/mroeschke) URL: https://github.com/rapidsai/cudf/pull/14006 --- python/cudf/cudf/core/column/column.py | 8 ++++++-- python/cudf/cudf/tests/test_interval.py | 16 ++++++++++++++++ 2 files changed, 22 insertions(+), 2 deletions(-) diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index d60f426c642..ad761ea8d18 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -2261,8 +2261,12 @@ def as_column( data = ColumnBase.from_scalar(arbitrary, length if length else 1) elif isinstance(arbitrary, pd.core.arrays.masked.BaseMaskedArray): data = as_column(pa.Array.from_pandas(arbitrary), dtype=dtype) - elif isinstance(arbitrary, pd.DatetimeIndex) and isinstance( - arbitrary.dtype, pd.DatetimeTZDtype + elif ( + isinstance(arbitrary, pd.DatetimeIndex) + and isinstance(arbitrary.dtype, pd.DatetimeTZDtype) + ) or ( + isinstance(arbitrary, pd.IntervalIndex) + and is_datetime64tz_dtype(arbitrary.dtype.subtype) ): raise NotImplementedError( "cuDF does not yet support timezone-aware datetimes" diff --git a/python/cudf/cudf/tests/test_interval.py b/python/cudf/cudf/tests/test_interval.py index f2e8f585a69..9704be44b95 100644 --- a/python/cudf/cudf/tests/test_interval.py +++ b/python/cudf/cudf/tests/test_interval.py @@ -165,3 +165,19 @@ def test_interval_index_unique(): actual = gi.unique() assert_eq(expected, actual) + + +@pytest.mark.parametrize("tz", ["US/Eastern", None]) +def test_interval_with_datetime(tz): + dti = pd.date_range( + start=pd.Timestamp("20180101", tz=tz), + end=pd.Timestamp("20181231", tz=tz), + freq="M", + ) + pidx = pd.IntervalIndex.from_breaks(dti) + if tz is None: + gidx = cudf.from_pandas(pidx) + assert_eq(pidx, 
gidx) + else: + with pytest.raises(NotImplementedError): + cudf.from_pandas(pidx) From c73ff70dc5ad85d71a0719606c688c2447d55d85 Mon Sep 17 00:00:00 2001 From: Martin Marenz Date: Thu, 31 Aug 2023 00:29:31 +0200 Subject: [PATCH 130/230] Enable fractional null probability for hashing benchmark (#13967) In the past, the HASING_NVBENCH benchmark treated the `nulls` parameter as a boolean. Any value other than 0.0 resulted in a null probability of 1.0. Now, the `nulls` parameter directly determines the null probability. For instance, a value of 0.1 will generate 10% of the data as null. Moreover, setting nulls to 0.0 produces data without a null bitmask. Additionally, `bytes_per_second` are added to the benchmark. This patch relates to #13735. Authors: - Martin Marenz (https://github.com/Blonck) - Yunsong Wang (https://github.com/PointKernel) Approvers: - Yunsong Wang (https://github.com/PointKernel) - David Wendt (https://github.com/davidwendt) URL: https://github.com/rapidsai/cudf/pull/13967 --- cpp/benchmarks/hashing/hash.cpp | 35 +++++++++++++++++++++++++++++---- 1 file changed, 31 insertions(+), 4 deletions(-) diff --git a/cpp/benchmarks/hashing/hash.cpp b/cpp/benchmarks/hashing/hash.cpp index f0e9202612e..e679b4b62d2 100644 --- a/cpp/benchmarks/hashing/hash.cpp +++ b/cpp/benchmarks/hashing/hash.cpp @@ -17,32 +17,59 @@ #include #include +#include #include #include #include +#include + static void bench_hash(nvbench::state& state) { - auto const num_rows = static_cast(state.get_int64("num_rows")); - auto const nulls = static_cast(state.get_float64("nulls")); + auto const num_rows = static_cast(state.get_int64("num_rows")); + auto const nulls = state.get_float64("nulls"); + // disable null bitmask if probability is exactly 0.0 + bool const no_nulls = nulls == 0.0; auto const hash_name = state.get_string("hash_name"); - data_profile const profile = data_profile_builder().null_probability(nulls); - auto const data = create_random_table( + data_profile const profile = + data_profile_builder().null_probability(no_nulls ? 
std::nullopt : std::optional{nulls}); + auto const data = create_random_table( {cudf::type_id::INT64, cudf::type_id::STRING}, row_count{num_rows}, profile); auto stream = cudf::get_default_stream(); state.set_cuda_stream(nvbench::make_cuda_stream_view(stream.value())); + // collect statistics + cudf::strings_column_view input(data->get_column(1).view()); + auto const chars_size = input.chars_size(); + // add memory read from string column + state.add_global_memory_reads(chars_size); + // add memory read from int64_t column + state.add_global_memory_reads(num_rows); + // add memory read from bitmaks + if (!no_nulls) { + state.add_global_memory_reads(2 * + cudf::bitmask_allocation_size_bytes(num_rows)); + } + // memory written depends on used hash + if (hash_name == "murmurhash3_x86_32") { + state.add_global_memory_writes(num_rows); + state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { auto result = cudf::hashing::murmurhash3_x86_32(data->view()); }); } else if (hash_name == "md5") { + // md5 creates a 32-byte string + state.add_global_memory_writes(32 * num_rows); + state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { auto result = cudf::hashing::md5(data->view()); }); } else if (hash_name == "spark_murmurhash3_x86_32") { + state.add_global_memory_writes(num_rows); + state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { auto result = cudf::hashing::spark_murmurhash3_x86_32(data->view()); }); From a2fd6883977fb73027f36357d7114e12bb683296 Mon Sep 17 00:00:00 2001 From: Lawrence Mitchell Date: Thu, 31 Aug 2023 21:33:30 +0100 Subject: [PATCH 131/230] Explicitly depend on zlib in conda recipes (#14018) We were previously obtaining zlib transitively through our cmake dependency, but since the 3.27.4 conda package, this dependency no longer exists. Therefore we must depend on zlib ourselves. 
- Closes #14021 Authors: - Lawrence Mitchell (https://github.com/wence-) Approvers: - Bradley Dice (https://github.com/bdice) - AJ Schmidt (https://github.com/ajschmidt8) - Vyas Ramasubramani (https://github.com/vyasr) URL: https://github.com/rapidsai/cudf/pull/14018 --- conda/environments/all_cuda-118_arch-x86_64.yaml | 1 + conda/environments/all_cuda-120_arch-x86_64.yaml | 1 + conda/recipes/libcudf/conda_build_config.yaml | 2 ++ conda/recipes/libcudf/meta.yaml | 1 + dependencies.yaml | 1 + 5 files changed, 6 insertions(+) diff --git a/conda/environments/all_cuda-118_arch-x86_64.yaml b/conda/environments/all_cuda-118_arch-x86_64.yaml index e4a9b2f1d29..8965a43b8ac 100644 --- a/conda/environments/all_cuda-118_arch-x86_64.yaml +++ b/conda/environments/all_cuda-118_arch-x86_64.yaml @@ -95,6 +95,7 @@ dependencies: - tokenizers==0.13.1 - transformers==4.24.0 - typing_extensions>=4.0.0 +- zlib>=1.2.13 - pip: - git+https://github.com/python-streamz/streamz.git@master name: all_cuda-118_arch-x86_64 diff --git a/conda/environments/all_cuda-120_arch-x86_64.yaml b/conda/environments/all_cuda-120_arch-x86_64.yaml index d03c4364435..4542eb79267 100644 --- a/conda/environments/all_cuda-120_arch-x86_64.yaml +++ b/conda/environments/all_cuda-120_arch-x86_64.yaml @@ -92,6 +92,7 @@ dependencies: - tokenizers==0.13.1 - transformers==4.24.0 - typing_extensions>=4.0.0 +- zlib>=1.2.13 - pip: - git+https://github.com/python-streamz/streamz.git@master name: all_cuda-120_arch-x86_64 diff --git a/conda/recipes/libcudf/conda_build_config.yaml b/conda/recipes/libcudf/conda_build_config.yaml index 0397045786b..25b3f19de77 100644 --- a/conda/recipes/libcudf/conda_build_config.yaml +++ b/conda/recipes/libcudf/conda_build_config.yaml @@ -40,6 +40,8 @@ spdlog_version: nvcomp_version: - "=2.6.1" +zlib_version: + - ">=1.2.13" # The CTK libraries below are missing from the conda-forge::cudatoolkit package # for CUDA 11. The "*_host_*" version specifiers correspond to `11.8` packages # and the "*_run_*" version specifiers correspond to `11.x` packages. diff --git a/conda/recipes/libcudf/meta.yaml b/conda/recipes/libcudf/meta.yaml index de32facba74..c844131ad31 100644 --- a/conda/recipes/libcudf/meta.yaml +++ b/conda/recipes/libcudf/meta.yaml @@ -73,6 +73,7 @@ requirements: - benchmark {{ gbench_version }} - gtest {{ gtest_version }} - gmock {{ gtest_version }} + - zlib {{ zlib_version }} outputs: - name: libcudf diff --git a/dependencies.yaml b/dependencies.yaml index a1d928797b0..97f86c6b864 100644 --- a/dependencies.yaml +++ b/dependencies.yaml @@ -179,6 +179,7 @@ dependencies: - c-compiler - cxx-compiler - dlpack>=0.5,<0.6.0a0 + - zlib>=1.2.13 specific: - output_types: conda matrices: From eeb761359f4eb5b15563177177c07b2ffa20cc4f Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Thu, 31 Aug 2023 16:08:03 -0700 Subject: [PATCH 132/230] Simplify wheel build scripts and allow alphas of RAPIDS dependencies (#13963) This PR makes a handful of changes aimed at simplifying the CI pipeline for building wheels as a precursor to switching RAPIDS nightlies to using proper alpha versions: - Inlines apply_wheel_modifications.sh in build_wheel.sh. Now that the build doesn't rely excessively on logic in shared workflows, there's no real benefit to having a separate script (previously apply_wheel_modification.sh was a special script that the shared workflow knew to execute i.e. it was a hook into an externally controlled workflow). 
- Consolidates the textual replacements using for loops and makes the replacements more targeted by only modifying the Python package being built in a given script. For instance, python/dask_cudf/pyproject.toml is no longer overwritten when building cudf. - Modifies dependency specs for RAPIDS packages to include a `>=0.0.0a0` component. This is the key change that will allow alpha dependencies to be discovered. Authors: - Vyas Ramasubramani (https://github.com/vyasr) Approvers: - Bradley Dice (https://github.com/bdice) - AJ Schmidt (https://github.com/ajschmidt8) URL: https://github.com/rapidsai/cudf/pull/13963 --- ci/build_wheel.sh | 55 +++++++++++++++++++++++++ ci/build_wheel_cudf.sh | 24 ++++------- ci/build_wheel_dask_cudf.sh | 19 ++------- ci/release/apply_wheel_modifications.sh | 32 -------------- 4 files changed, 65 insertions(+), 65 deletions(-) create mode 100755 ci/build_wheel.sh delete mode 100755 ci/release/apply_wheel_modifications.sh diff --git a/ci/build_wheel.sh b/ci/build_wheel.sh new file mode 100755 index 00000000000..06d0c3c7a56 --- /dev/null +++ b/ci/build_wheel.sh @@ -0,0 +1,55 @@ +#!/bin/bash +# Copyright (c) 2023, NVIDIA CORPORATION. + +set -euo pipefail + +package_name=$1 +package_dir=$2 + +source rapids-configure-sccache +source rapids-date-string + +# Use gha-tools rapids-pip-wheel-version to generate wheel version then +# update the necessary files +version_override="$(rapids-pip-wheel-version ${RAPIDS_DATE_STRING})" + +RAPIDS_PY_CUDA_SUFFIX="$(rapids-wheel-ctk-name-gen ${RAPIDS_CUDA_VERSION})" + +# This is the version of the suffix with a preceding hyphen. It's used +# everywhere except in the final wheel name. +PACKAGE_CUDA_SUFFIX="-${RAPIDS_PY_CUDA_SUFFIX}" + +# Patch project metadata files to include the CUDA version suffix and version override. +pyproject_file="${package_dir}/pyproject.toml" + +sed -i "s/^version = .*/version = \"${version_override}\"/g" ${pyproject_file} +sed -i "s/name = \"${package_name}\"/name = \"${package_name}${PACKAGE_CUDA_SUFFIX}\"/g" ${pyproject_file} + +# For nightlies we want to ensure that we're pulling in alphas as well. The +# easiest way to do so is to augment the spec with a constraint containing a +# min alpha version that doesn't affect the version bounds but does allow usage +# of alpha versions for that dependency without --pre +alpha_spec='' +if ! rapids-is-release-build; then + alpha_spec=',>=0.0.0a0' +fi + +if [[ ${package_name} == "dask_cudf" ]]; then + sed -r -i "s/cudf==(.*)\"/cudf${PACKAGE_CUDA_SUFFIX}==\1${alpha_spec}\"/g" ${pyproject_file} +else + sed -r -i "s/rmm(.*)\"/rmm${PACKAGE_CUDA_SUFFIX}\1${alpha_spec}\"/g" ${pyproject_file} + # ptxcompiler and cubinlinker aren't version constrained + sed -r -i "s/ptxcompiler\"/ptxcompiler${PACKAGE_CUDA_SUFFIX}\"/g" ${pyproject_file} + sed -r -i "s/cubinlinker\"/cubinlinker${PACKAGE_CUDA_SUFFIX}\"/g" ${pyproject_file} +fi + +if [[ $PACKAGE_CUDA_SUFFIX == "-cu12" ]]; then + sed -i "s/cuda-python[<=>\.,0-9a]*/cuda-python>=12.0,<13.0a0/g" ${pyproject_file} + sed -i "s/cupy-cuda11x/cupy-cuda12x/g" ${pyproject_file} + sed -i "/ptxcompiler/d" ${pyproject_file} + sed -i "/cubinlinker/d" ${pyproject_file} +fi + +cd "${package_dir}" + +python -m pip wheel . 
-w dist -vvv --no-deps --disable-pip-version-check diff --git a/ci/build_wheel_cudf.sh b/ci/build_wheel_cudf.sh index c20a5162788..7d3919b2d72 100755 --- a/ci/build_wheel_cudf.sh +++ b/ci/build_wheel_cudf.sh @@ -3,24 +3,14 @@ set -euo pipefail -source rapids-configure-sccache -source rapids-date-string +package_dir="python/cudf" -# Use gha-tools rapids-pip-wheel-version to generate wheel version then -# update the necessary files -version_override="$(rapids-pip-wheel-version ${RAPIDS_DATE_STRING})" +export SKBUILD_CONFIGURE_OPTIONS="-DCUDF_BUILD_WHEELS=ON -DDETECT_CONDA_ENV=OFF" -RAPIDS_PY_CUDA_SUFFIX="$(rapids-wheel-ctk-name-gen ${RAPIDS_CUDA_VERSION})" - -ci/release/apply_wheel_modifications.sh ${version_override} "-${RAPIDS_PY_CUDA_SUFFIX}" -echo "The package name and/or version was modified in the package source. The git diff is:" -git diff - -cd python/cudf +./ci/build_wheel.sh cudf ${package_dir} -SKBUILD_CONFIGURE_OPTIONS="-DCUDF_BUILD_WHEELS=ON -DDETECT_CONDA_ENV=OFF" python -m pip wheel . -w dist -vvv --no-deps --disable-pip-version-check +mkdir -p ${package_dir}/final_dist +python -m auditwheel repair -w ${package_dir}/final_dist ${package_dir}/dist/* -mkdir -p final_dist -python -m auditwheel repair -w final_dist dist/* - -RAPIDS_PY_WHEEL_NAME="cudf_${RAPIDS_PY_CUDA_SUFFIX}" rapids-upload-wheels-to-s3 final_dist +RAPIDS_PY_CUDA_SUFFIX="$(rapids-wheel-ctk-name-gen ${RAPIDS_CUDA_VERSION})" +RAPIDS_PY_WHEEL_NAME="cudf_${RAPIDS_PY_CUDA_SUFFIX}" rapids-upload-wheels-to-s3 ${package_dir}/final_dist diff --git a/ci/build_wheel_dask_cudf.sh b/ci/build_wheel_dask_cudf.sh index 9af90d3a863..47e35c46004 100755 --- a/ci/build_wheel_dask_cudf.sh +++ b/ci/build_wheel_dask_cudf.sh @@ -3,22 +3,9 @@ set -euo pipefail -source rapids-configure-sccache -source rapids-date-string +package_dir="python/dask_cudf" -# Use gha-tools rapids-pip-wheel-version to generate wheel version then -# update the necessary files -version_override="$(rapids-pip-wheel-version ${RAPIDS_DATE_STRING})" +./ci/build_wheel.sh dask_cudf ${package_dir} RAPIDS_PY_CUDA_SUFFIX="$(rapids-wheel-ctk-name-gen ${RAPIDS_CUDA_VERSION})" - -ci/release/apply_wheel_modifications.sh ${version_override} "-${RAPIDS_PY_CUDA_SUFFIX}" -echo "The package name and/or version was modified in the package source. The git diff is:" -git diff - -cd python/dask_cudf - -# Hardcode the output dir -python -m pip wheel . -w dist -vvv --no-deps --disable-pip-version-check - -RAPIDS_PY_WHEEL_NAME="dask_cudf_${RAPIDS_PY_CUDA_SUFFIX}" rapids-upload-wheels-to-s3 dist +RAPIDS_PY_WHEEL_NAME="dask_cudf_${RAPIDS_PY_CUDA_SUFFIX}" rapids-upload-wheels-to-s3 ${package_dir}/dist diff --git a/ci/release/apply_wheel_modifications.sh b/ci/release/apply_wheel_modifications.sh deleted file mode 100755 index 9d337aaa057..00000000000 --- a/ci/release/apply_wheel_modifications.sh +++ /dev/null @@ -1,32 +0,0 @@ -#!/bin/bash -# Copyright (c) 2023, NVIDIA CORPORATION. 
-# -# Usage: bash apply_wheel_modifications.sh - -VERSION=${1} -CUDA_SUFFIX=${2} - -# pyproject.toml versions -sed -i "s/^version = .*/version = \"${VERSION}\"/g" python/cudf/pyproject.toml -sed -i "s/^version = .*/version = \"${VERSION}\"/g" python/dask_cudf/pyproject.toml -sed -i "s/^version = .*/version = \"${VERSION}\"/g" python/cudf_kafka/pyproject.toml -sed -i "s/^version = .*/version = \"${VERSION}\"/g" python/custreamz/pyproject.toml - -# cudf pyproject.toml cuda suffixes -sed -i "s/^name = \"cudf\"/name = \"cudf${CUDA_SUFFIX}\"/g" python/cudf/pyproject.toml -sed -i "s/rmm/rmm${CUDA_SUFFIX}/g" python/cudf/pyproject.toml -sed -i "s/ptxcompiler/ptxcompiler${CUDA_SUFFIX}/g" python/cudf/pyproject.toml -sed -i "s/cubinlinker/cubinlinker${CUDA_SUFFIX}/g" python/cudf/pyproject.toml - -# dask_cudf pyproject.toml cuda suffixes -sed -i "s/^name = \"dask_cudf\"/name = \"dask_cudf${CUDA_SUFFIX}\"/g" python/dask_cudf/pyproject.toml -# Need to provide the == to avoid modifying the URL -sed -i "s/\"cudf==/\"cudf${CUDA_SUFFIX}==/g" python/dask_cudf/pyproject.toml - -if [[ $CUDA_SUFFIX == "-cu12" ]]; then - sed -i "s/cuda-python[<=>\.,0-9a]*/cuda-python>=12.0,<13.0a0/g" python/cudf/pyproject.toml - sed -i "s/cupy-cuda11x/cupy-cuda12x/g" python/{cudf,dask_cudf}/pyproject.toml - sed -i "s/numba[<=>\.,0-9]*/numba>=0.57/g" python/{cudf,dask_cudf}/pyproject.toml - sed -i "/ptxcompiler/d" python/cudf/pyproject.toml - sed -i "/cubinlinker/d" python/cudf/pyproject.toml -fi From a8d3597ace2291d74325dd24c9cfa5126bb21847 Mon Sep 17 00:00:00 2001 From: David Wendt <45795991+davidwendt@users.noreply.github.com> Date: Thu, 31 Aug 2023 19:19:21 -0400 Subject: [PATCH 133/230] Add stream parameter to public cudf::strings::split APIs (#13997) Adds a `stream` parameter to the libcudf `cudf:strings::split(), cudf:strings::rsplit(), cudf:strings::split_record(), cudf:strings::rsplit_record()` APIs. Change needed for work in #13891 Authors: - David Wendt (https://github.com/davidwendt) Approvers: - https://github.com/nvdbaranec - Bradley Dice (https://github.com/bdice) - Vyas Ramasubramani (https://github.com/vyasr) URL: https://github.com/rapidsai/cudf/pull/13997 --- cpp/include/cudf/strings/split/split.hpp | 56 ++++++++++++++---------- cpp/src/strings/split/split.cu | 6 ++- cpp/src/strings/split/split_record.cu | 7 +-- 3 files changed, 40 insertions(+), 29 deletions(-) diff --git a/cpp/include/cudf/strings/split/split.hpp b/cpp/include/cudf/strings/split/split.hpp index a6c942d39b4..701950e61a5 100644 --- a/cpp/include/cudf/strings/split/split.hpp +++ b/cpp/include/cudf/strings/split/split.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2022, NVIDIA CORPORATION. + * Copyright (c) 2019-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -43,18 +43,20 @@ namespace strings { * * Any null string entries return corresponding null output columns. * - * @param strings_column Strings instance for this operation. - * @param delimiter UTF-8 encoded string indicating the split points in each string. + * @param strings_column Strings instance for this operation + * @param delimiter UTF-8 encoded string indicating the split points in each string; * Default of empty string indicates split on whitespace. - * @param maxsplit Maximum number of splits to perform. + * @param maxsplit Maximum number of splits to perform; * Default of -1 indicates all possible splits on each string. 
- * @param mr Device memory resource used to allocate the returned table's device memory. - * @return New table of strings columns. + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate the returned table's device memory + * @return New table of strings columns */ std::unique_ptr
split( strings_column_view const& strings_column, string_scalar const& delimiter = string_scalar(""), size_type maxsplit = -1, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -71,18 +73,20 @@ std::unique_ptr
split( * * Any null string entries return corresponding null output columns. * - * @param strings_column Strings instance for this operation. - * @param delimiter UTF-8 encoded string indicating the split points in each string. + * @param strings_column Strings instance for this operation + * @param delimiter UTF-8 encoded string indicating the split points in each string; * Default of empty string indicates split on whitespace. - * @param maxsplit Maximum number of splits to perform. + * @param maxsplit Maximum number of splits to perform; * Default of -1 indicates all possible splits on each string. - * @param mr Device memory resource used to allocate the returned table's device memory. + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate the returned table's device memory * @return New strings columns. */ std::unique_ptr
rsplit( strings_column_view const& strings_column, string_scalar const& delimiter = string_scalar(""), size_type maxsplit = -1, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -141,20 +145,22 @@ std::unique_ptr
rsplit( * * @throw cudf:logic_error if `delimiter` is invalid. * - * @param strings A column of string elements to be split. - * @param delimiter The string to identify split points in each string. + * @param strings A column of string elements to be split + * @param delimiter The string to identify split points in each string; * Default of empty string indicates split on whitespace. - * @param maxsplit Maximum number of splits to perform. - * Default of -1 indicates all possible splits on each string. - * @param mr Device memory resource used to allocate the returned result's device memory. - * @return Lists column of strings - * Each vector of the lists column holds splits from a single row + * @param maxsplit Maximum number of splits to perform; + * Default of -1 indicates all possible splits on each string + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate the returned result's device memory + * @return Lists column of strings; + * Each row of the lists column holds splits from a single row * element of the input column. */ std::unique_ptr split_record( strings_column_view const& strings, string_scalar const& delimiter = string_scalar(""), size_type maxsplit = -1, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -218,20 +224,22 @@ std::unique_ptr split_record( * * @throw cudf:logic_error if `delimiter` is invalid. * - * @param strings A column of string elements to be split. - * @param delimiter The string to identify split points in each string. + * @param strings A column of string elements to be split + * @param delimiter The string to identify split points in each string; * Default of empty string indicates split on whitespace. - * @param maxsplit Maximum number of splits to perform. - * Default of -1 indicates all possible splits on each string. - * @param mr Device memory resource used to allocate the returned result's device memory. - * @return Lists column of strings - * Each vector of the lists column holds splits from a single row + * @param maxsplit Maximum number of splits to perform; + * Default of -1 indicates all possible splits on each string + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate the returned result's device memory + * @return Lists column of strings; + * Each row of the lists column holds splits from a single row * element of the input column. */ std::unique_ptr rsplit_record( strings_column_view const& strings, string_scalar const& delimiter = string_scalar(""), size_type maxsplit = -1, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @} */ // end of doxygen group diff --git a/cpp/src/strings/split/split.cu b/cpp/src/strings/split/split.cu index 56704a4a4b0..bad7eef4523 100644 --- a/cpp/src/strings/split/split.cu +++ b/cpp/src/strings/split/split.cu @@ -431,19 +431,21 @@ std::unique_ptr
rsplit(strings_column_view const& strings_column, std::unique_ptr
split(strings_column_view const& strings_column, string_scalar const& delimiter, size_type maxsplit, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::split(strings_column, delimiter, maxsplit, cudf::get_default_stream(), mr); + return detail::split(strings_column, delimiter, maxsplit, stream, mr); } std::unique_ptr
rsplit(strings_column_view const& strings_column, string_scalar const& delimiter, size_type maxsplit, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::rsplit(strings_column, delimiter, maxsplit, cudf::get_default_stream(), mr); + return detail::rsplit(strings_column, delimiter, maxsplit, stream, mr); } } // namespace strings diff --git a/cpp/src/strings/split/split_record.cu b/cpp/src/strings/split/split_record.cu index fab95f4f6d1..52f27c68111 100644 --- a/cpp/src/strings/split/split_record.cu +++ b/cpp/src/strings/split/split_record.cu @@ -203,21 +203,22 @@ std::unique_ptr split_record(strings_column_view const& strings, std::unique_ptr split_record(strings_column_view const& strings, string_scalar const& delimiter, size_type maxsplit, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::split_record( - strings, delimiter, maxsplit, cudf::get_default_stream(), mr); + return detail::split_record(strings, delimiter, maxsplit, stream, mr); } std::unique_ptr rsplit_record(strings_column_view const& strings, string_scalar const& delimiter, size_type maxsplit, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); return detail::split_record( - strings, delimiter, maxsplit, cudf::get_default_stream(), mr); + strings, delimiter, maxsplit, stream, mr); } } // namespace strings From ad9fa501192332ca8ce310ffe967473ec0945a97 Mon Sep 17 00:00:00 2001 From: GALI PREM SAGAR Date: Thu, 31 Aug 2023 18:33:26 -0500 Subject: [PATCH 134/230] Preserve types of scalar being returned when possible in `quantile` (#14014) closes #14002 This PR changes the behavior of the `quantile` API by preserving the return type of scalar `interpolation` is either one of `"lower", "higher", "nearest"`. Authors: - GALI PREM SAGAR (https://github.com/galipremsagar) Approvers: - Bradley Dice (https://github.com/bdice) - Vyas Ramasubramani (https://github.com/vyasr) URL: https://github.com/rapidsai/cudf/pull/14014 --- python/cudf/cudf/core/column/numerical_base.py | 10 ++++++++++ python/cudf/cudf/core/dataframe.py | 13 ++++++++++--- python/cudf/cudf/core/series.py | 9 +++++++-- python/cudf/cudf/tests/test_quantiles.py | 15 +++++++++++++++ 4 files changed, 42 insertions(+), 5 deletions(-) diff --git a/python/cudf/cudf/core/column/numerical_base.py b/python/cudf/cudf/core/column/numerical_base.py index 08c2f7cc7b1..e59d56af9dc 100644 --- a/python/cudf/cudf/core/column/numerical_base.py +++ b/python/cudf/cudf/core/column/numerical_base.py @@ -115,6 +115,16 @@ def quantile( result = self._numeric_quantile(q, interpolation, exact) if return_scalar: scalar_result = result.element_indexing(0) + if interpolation in {"lower", "higher", "nearest"}: + try: + new_scalar = self.dtype.type(scalar_result) + scalar_result = ( + new_scalar + if new_scalar == scalar_result + else scalar_result + ) + except (TypeError, ValueError): + pass return ( cudf.utils.dtypes._get_nan_for_dtype(self.dtype) if scalar_result is NA diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index 3f89f78d278..e67604069f1 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -5487,16 +5487,23 @@ def quantile( numeric_only : bool, default True If False, the quantile of datetime and timedelta data will be computed as well. 
- interpolation : {`linear`, `lower`, `higher`, `midpoint`, `nearest`} + interpolation : {'linear', 'lower', 'higher', 'midpoint', 'nearest'} This parameter specifies the interpolation method to use, when the desired quantile lies between two data points i and j. - Default is ``linear`` for ``method="single"``, and ``nearest`` + Default is ``'linear'`` for ``method="single"``, and ``'nearest'`` for ``method="table"``. + + * linear: `i + (j - i) * fraction`, where `fraction` is the + fractional part of the index surrounded by `i` and `j`. + * lower: `i`. + * higher: `j`. + * nearest: `i` or `j` whichever is nearest. + * midpoint: (`i` + `j`) / 2. columns : list of str List of column names to include. exact : boolean Whether to use approximate or exact quantile algorithm. - method : {`single`, `table`}, default `single` + method : {'single', 'table'}, default `'single'` Whether to compute quantiles per-column ('single') or over all columns ('table'). When 'table', the only allowed interpolation methods are 'nearest', 'lower', and 'higher'. diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index 30d584c2270..2fef741ac09 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -3132,8 +3132,13 @@ def quantile( interpolation : {'linear', 'lower', 'higher', 'midpoint', 'nearest'} This optional parameter specifies the interpolation method to use, when the desired quantile lies between two data points i and j: - columns : list of str - List of column names to include. + + * linear: `i + (j - i) * fraction`, where `fraction` is the + fractional part of the index surrounded by `i` and `j`. + * lower: `i`. + * higher: `j`. + * nearest: `i` or `j` whichever is nearest. + * midpoint: (`i` + `j`) / 2. exact : boolean Whether to use approximate or exact quantile algorithm. quant_index : boolean diff --git a/python/cudf/cudf/tests/test_quantiles.py b/python/cudf/cudf/tests/test_quantiles.py index 53b06e64a91..8b126073a0f 100644 --- a/python/cudf/cudf/tests/test_quantiles.py +++ b/python/cudf/cudf/tests/test_quantiles.py @@ -75,3 +75,18 @@ def test_quantile_q_type(): ), ): gs.quantile(cudf.DataFrame()) + + +@pytest.mark.parametrize( + "interpolation", ["linear", "lower", "higher", "midpoint", "nearest"] +) +def test_quantile_type_int_float(interpolation): + data = [1, 3, 4] + psr = pd.Series(data) + gsr = cudf.Series(data) + + expected = psr.quantile(0.5, interpolation=interpolation) + actual = gsr.quantile(0.5, interpolation=interpolation) + + assert expected == actual + assert type(expected) == type(actual) From 12fe7ee98901d51e8ee369b09ba2615b3a38dfbd Mon Sep 17 00:00:00 2001 From: Bradley Dice Date: Thu, 31 Aug 2023 21:58:10 -0500 Subject: [PATCH 135/230] Fix typo in docstring: metadata. (#14025) Fix for a typo in a docstring for `contiguous_split`. 
Authors: - Bradley Dice (https://github.com/bdice) Approvers: - Divye Gala (https://github.com/divyegala) - Vyas Ramasubramani (https://github.com/vyasr) - Nghia Truong (https://github.com/ttnghia) URL: https://github.com/rapidsai/cudf/pull/14025 --- cpp/include/cudf/contiguous_split.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/include/cudf/contiguous_split.hpp b/cpp/include/cudf/contiguous_split.hpp index 5fe4e738714..bf10f1fd489 100644 --- a/cpp/include/cudf/contiguous_split.hpp +++ b/cpp/include/cudf/contiguous_split.hpp @@ -28,7 +28,7 @@ namespace cudf { * @addtogroup column_copy * @{ * @file - * @brief Table APIs for contiguous_split, pack, unpack, and metadadata + * @brief Table APIs for contiguous_split, pack, unpack, and metadata */ /** From 27e433ad837e72c71acd37376c98b2e5aeb450ad Mon Sep 17 00:00:00 2001 From: Bradley Dice Date: Fri, 1 Sep 2023 00:02:07 -0500 Subject: [PATCH 136/230] Use grid_stride for stride computations. (#13996) This PR adds `grid_1d::grid_stride()` and uses it in a handful of kernels. Follow-up to #13910, which added a `grid_1d::global_thread_id()`. We'll need to do a later PR that catches any missing instances where this should be used, since there are a large number of PRs in flight touching thread indexing code in various files. See #10368. Authors: - Bradley Dice (https://github.com/bdice) Approvers: - Yunsong Wang (https://github.com/PointKernel) - Vyas Ramasubramani (https://github.com/vyasr) URL: https://github.com/rapidsai/cudf/pull/13996 --- cpp/include/cudf/detail/utilities/cuda.cuh | 23 ++++++++++++++++++++++ cpp/include/cudf/detail/valid_if.cuh | 4 ++-- cpp/src/bitmask/null_mask.cu | 6 +++--- cpp/src/copying/scatter.cu | 4 ++-- cpp/src/partitioning/partitioning.cu | 18 ++++++++--------- cpp/src/replace/nulls.cu | 12 +++++------ cpp/src/transform/compute_column.cu | 5 ++--- 7 files changed, 46 insertions(+), 26 deletions(-) diff --git a/cpp/include/cudf/detail/utilities/cuda.cuh b/cpp/include/cudf/detail/utilities/cuda.cuh index c95189f1f94..264302df0e9 100644 --- a/cpp/include/cudf/detail/utilities/cuda.cuh +++ b/cpp/include/cudf/detail/utilities/cuda.cuh @@ -92,6 +92,29 @@ class grid_1d { { return global_thread_id(threadIdx.x, blockIdx.x, blockDim.x); } + + /** + * @brief Returns the stride of a 1D grid. + * + * The returned stride is the total number of threads in the grid. + * + * @param thread_id The thread index within the block + * @param block_id The block index within the grid + * @param num_threads_per_block The number of threads per block + * @return thread_index_type The global thread index + */ + static constexpr thread_index_type grid_stride(thread_index_type num_threads_per_block, + thread_index_type num_blocks_per_grid) + { + return num_threads_per_block * num_blocks_per_grid; + } + + /** + * @brief Returns the stride of the current 1D grid. + * + * @return thread_index_type The number of threads in the grid. 
+ */ + static __device__ thread_index_type grid_stride() { return grid_stride(blockDim.x, gridDim.x); } }; /** diff --git a/cpp/include/cudf/detail/valid_if.cuh b/cpp/include/cudf/detail/valid_if.cuh index bed884a23eb..f3f95dad017 100644 --- a/cpp/include/cudf/detail/valid_if.cuh +++ b/cpp/include/cudf/detail/valid_if.cuh @@ -49,8 +49,8 @@ __global__ void valid_if_kernel( { constexpr size_type leader_lane{0}; auto const lane_id{threadIdx.x % warp_size}; - thread_index_type i = threadIdx.x + blockIdx.x * blockDim.x; - thread_index_type const stride = blockDim.x * gridDim.x; + auto i = cudf::detail::grid_1d::global_thread_id(); + auto const stride = cudf::detail::grid_1d::grid_stride(); size_type warp_valid_count{0}; auto active_mask = __ballot_sync(0xFFFF'FFFFu, i < size); diff --git a/cpp/src/bitmask/null_mask.cu b/cpp/src/bitmask/null_mask.cu index 33dc7e0556b..5a0d3e4f120 100644 --- a/cpp/src/bitmask/null_mask.cu +++ b/cpp/src/bitmask/null_mask.cu @@ -108,7 +108,7 @@ __global__ void set_null_mask_kernel(bitmask_type* __restrict__ destination, thread_index_type const last_word = word_index(end_bit) - word_index(begin_bit); bitmask_type fill_value = valid ? 0xffff'ffff : 0; - thread_index_type const stride = blockDim.x * gridDim.x; + auto const stride = cudf::detail::grid_1d::grid_stride(); for (thread_index_type destination_word_index = grid_1d::global_thread_id(); destination_word_index < number_of_mask_words; @@ -191,7 +191,7 @@ __global__ void copy_offset_bitmask(bitmask_type* __restrict__ destination, size_type source_end_bit, size_type number_of_mask_words) { - thread_index_type const stride = blockDim.x * gridDim.x; + auto const stride = cudf::detail::grid_1d::grid_stride(); for (thread_index_type destination_word_index = grid_1d::global_thread_id(); destination_word_index < number_of_mask_words; destination_word_index += stride) { @@ -265,7 +265,7 @@ __global__ void count_set_bits_kernel(bitmask_type const* bitmask, auto const first_word_index{word_index(first_bit_index)}; auto const last_word_index{word_index(last_bit_index)}; thread_index_type const tid = grid_1d::global_thread_id(); - thread_index_type const stride = blockDim.x * gridDim.x; + thread_index_type const stride = grid_1d::grid_stride(); thread_index_type thread_word_index = tid + first_word_index; size_type thread_count{0}; diff --git a/cpp/src/copying/scatter.cu b/cpp/src/copying/scatter.cu index 96e24e9059d..11c27fc86e3 100644 --- a/cpp/src/copying/scatter.cu +++ b/cpp/src/copying/scatter.cu @@ -52,8 +52,8 @@ __global__ void marking_bitmask_kernel(mutable_column_device_view destination, MapIterator scatter_map, size_type num_scatter_rows) { - thread_index_type row = threadIdx.x + blockIdx.x * blockDim.x; - thread_index_type const stride = blockDim.x * gridDim.x; + auto row = cudf::detail::grid_1d::global_thread_id(); + auto const stride = cudf::detail::grid_1d::grid_stride(); while (row < num_scatter_rows) { size_type const output_row = scatter_map[row]; diff --git a/cpp/src/partitioning/partitioning.cu b/cpp/src/partitioning/partitioning.cu index ff9c4ea2f59..7b6676346c2 100644 --- a/cpp/src/partitioning/partitioning.cu +++ b/cpp/src/partitioning/partitioning.cu @@ -134,8 +134,8 @@ __global__ void compute_row_partition_numbers(row_hasher_t the_hasher, // Accumulate histogram of the size of each partition in shared memory extern __shared__ size_type shared_partition_sizes[]; - auto tid = cudf::thread_index_type{threadIdx.x} + - cudf::thread_index_type{blockIdx.x} * cudf::thread_index_type{blockDim.x}; + auto tid = 
cudf::detail::grid_1d::global_thread_id(); + auto const stride = cudf::detail::grid_1d::grid_stride(); // Initialize local histogram size_type partition_number = threadIdx.x; @@ -160,7 +160,7 @@ __global__ void compute_row_partition_numbers(row_hasher_t the_hasher, row_partition_offset[row_number] = atomicAdd(&(shared_partition_sizes[partition_number]), size_type(1)); - tid += cudf::thread_index_type{blockDim.x} * cudf::thread_index_type{gridDim.x}; + tid += stride; } __syncthreads(); @@ -215,8 +215,8 @@ __global__ void compute_row_output_locations(size_type* __restrict__ row_partiti } __syncthreads(); - auto tid = cudf::thread_index_type{threadIdx.x} + - cudf::thread_index_type{blockIdx.x} * cudf::thread_index_type{blockDim.x}; + auto tid = cudf::detail::grid_1d::global_thread_id(); + auto const stride = cudf::detail::grid_1d::grid_stride(); // Get each row's partition number, and get it's output location by // incrementing block's offset counter for that partition number @@ -234,7 +234,7 @@ __global__ void compute_row_output_locations(size_type* __restrict__ row_partiti // Store the row's output location in-place row_partition_numbers[row_number] = row_output_location; - tid += cudf::thread_index_type{blockDim.x} * cudf::thread_index_type{gridDim.x}; + tid += stride; } } @@ -311,10 +311,8 @@ __global__ void copy_block_partitions(InputIter input_iter, __syncthreads(); // Fetch the input data to shared memory - for (auto tid = cudf::thread_index_type{threadIdx.x} + - cudf::thread_index_type{blockIdx.x} * cudf::thread_index_type{blockDim.x}; - tid < num_rows; - tid += cudf::thread_index_type{blockDim.x} * cudf::thread_index_type{gridDim.x}) { + for (auto tid = cudf::detail::grid_1d::global_thread_id(); tid < num_rows; + tid += cudf::detail::grid_1d::grid_stride()) { auto const row_number = static_cast(tid); size_type const ipartition = row_partition_numbers[row_number]; diff --git a/cpp/src/replace/nulls.cu b/cpp/src/replace/nulls.cu index e033db0e52a..5b9fd3d9f0f 100644 --- a/cpp/src/replace/nulls.cu +++ b/cpp/src/replace/nulls.cu @@ -64,9 +64,9 @@ __global__ void replace_nulls_strings(cudf::column_device_view input, char* chars, cudf::size_type* valid_counter) { - cudf::size_type nrows = input.size(); - cudf::thread_index_type i = blockIdx.x * blockDim.x + threadIdx.x; - cudf::thread_index_type const stride = blockDim.x * gridDim.x; + cudf::size_type nrows = input.size(); + auto i = cudf::detail::grid_1d::global_thread_id(); + auto const stride = cudf::detail::grid_1d::grid_stride(); uint32_t active_mask = 0xffff'ffff; active_mask = __ballot_sync(active_mask, i < nrows); @@ -117,9 +117,9 @@ __global__ void replace_nulls(cudf::column_device_view input, cudf::mutable_column_device_view output, cudf::size_type* output_valid_count) { - cudf::size_type nrows = input.size(); - cudf::thread_index_type i = blockIdx.x * blockDim.x + threadIdx.x; - cudf::thread_index_type const stride = blockDim.x * gridDim.x; + cudf::size_type nrows = input.size(); + auto i = cudf::detail::grid_1d::global_thread_id(); + auto const stride = cudf::detail::grid_1d::grid_stride(); uint32_t active_mask = 0xffff'ffff; active_mask = __ballot_sync(active_mask, i < nrows); diff --git a/cpp/src/transform/compute_column.cu b/cpp/src/transform/compute_column.cu index 61293d51ba2..224dd93b048 100644 --- a/cpp/src/transform/compute_column.cu +++ b/cpp/src/transform/compute_column.cu @@ -69,9 +69,8 @@ __launch_bounds__(max_block_size) __global__ auto thread_intermediate_storage = &intermediate_storage[threadIdx.x * 
device_expression_data.num_intermediates]; - auto const start_idx = - static_cast(threadIdx.x + blockIdx.x * blockDim.x); - auto const stride = static_cast(blockDim.x * gridDim.x); + auto start_idx = cudf::detail::grid_1d::global_thread_id(); + auto const stride = cudf::detail::grid_1d::grid_stride(); auto evaluator = cudf::ast::detail::expression_evaluator(table, device_expression_data); From b705c814b5fa44638e4168abcb070ece1d040b24 Mon Sep 17 00:00:00 2001 From: "Robert (Bobby) Evans" Date: Fri, 1 Sep 2023 08:41:31 -0500 Subject: [PATCH 137/230] Added pinned pool reservation API for java (#13964) This adds in an API to java to reserve pinned memory. It is very much like allocating it, but provides a way to guarantee up front that there is enough memory for multiple allocations to succeed. Authors: - Robert (Bobby) Evans (https://github.com/revans2) Approvers: - Gera Shegalov (https://github.com/gerashegalov) URL: https://github.com/rapidsai/cudf/pull/13964 --- .../ai/rapids/cudf/HostMemoryReservation.java | 32 ++++ .../java/ai/rapids/cudf/PinnedMemoryPool.java | 158 ++++++++++++++++-- 2 files changed, 175 insertions(+), 15 deletions(-) create mode 100644 java/src/main/java/ai/rapids/cudf/HostMemoryReservation.java diff --git a/java/src/main/java/ai/rapids/cudf/HostMemoryReservation.java b/java/src/main/java/ai/rapids/cudf/HostMemoryReservation.java new file mode 100644 index 00000000000..72c2e659372 --- /dev/null +++ b/java/src/main/java/ai/rapids/cudf/HostMemoryReservation.java @@ -0,0 +1,32 @@ +/* + * + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + */ + +package ai.rapids.cudf; + +/** + * Represents some amount of host memory that has been reserved. A reservation guarantees that one + * or more allocations up to the reserved amount, minus padding for alignment will succeed. A + * reservation typically guarantees the amount can be allocated one, meaning when a buffer + * allocated from a reservation is freed it is not returned to the reservation, but to the pool of + * memory the reservation originally came from. If more memory is allocated from the reservation + * an OutOfMemoryError may be thrown, but it is not guaranteed to happen. + * + * When the reservation is closed any unused reservation will be returned to the pool of memory + * the reservation came from. + */ +public interface HostMemoryReservation extends HostMemoryAllocator, AutoCloseable {} diff --git a/java/src/main/java/ai/rapids/cudf/PinnedMemoryPool.java b/java/src/main/java/ai/rapids/cudf/PinnedMemoryPool.java index 969946a9533..9ce72ba237e 100644 --- a/java/src/main/java/ai/rapids/cudf/PinnedMemoryPool.java +++ b/java/src/main/java/ai/rapids/cudf/PinnedMemoryPool.java @@ -1,6 +1,6 @@ /* * - * Copyright (c) 2019-2022, NVIDIA CORPORATION. + * Copyright (c) 2019-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -37,13 +37,14 @@ */ public final class PinnedMemoryPool implements AutoCloseable { private static final Logger log = LoggerFactory.getLogger(PinnedMemoryPool.class); - private static final long ALIGNMENT = 8; + private static final long ALIGNMENT = ColumnView.hostPaddingSizeInBytes(); // These static fields should only ever be accessed when class-synchronized. // Do NOT use singleton_ directly! Use the getSingleton accessor instead. private static volatile PinnedMemoryPool singleton_ = null; private static Future initFuture = null; + private final long totalPoolSize; private final long pinnedPoolBase; private final SortedSet freeHeap = new TreeSet<>(new SortedByAddress()); private int numAllocatedSections = 0; @@ -164,6 +165,14 @@ private static void freeInternal(MemorySection section) { Objects.requireNonNull(getSingleton()).free(section); } + /** + * Used to indicate that memory was allocated from a reservation. This primarily is for + * keeping track of outstanding allocations. + */ + private static void reserveAllocInternal(MemorySection section) { + Objects.requireNonNull(getSingleton()).reserveAllocHappened(section); + } + /** * Initialize the pool. * @@ -226,6 +235,21 @@ public static HostMemoryBuffer tryAllocate(long bytes) { return result; } + /** + * Factory method to create a pinned host memory reservation. + * + * @param bytes size in bytes to reserve + * @return newly created reservation or null if insufficient pinned memory to cover it. + */ + public static HostMemoryReservation tryReserve(long bytes) { + HostMemoryReservation result = null; + PinnedMemoryPool pool = getSingleton(); + if (pool != null) { + result = pool.tryReserveInternal(bytes); + } + return result; + } + /** * Factory method to create a host buffer but preferably pointing to pinned memory. * It is not guaranteed that the returned buffer will be pointer to pinned memory. @@ -233,7 +257,7 @@ public static HostMemoryBuffer tryAllocate(long bytes) { * @param bytes size in bytes to allocate * @return newly created buffer */ - public static HostMemoryBuffer allocate(long bytes, HostMemoryAllocator hostMemoryAllocator) { + public static HostMemoryBuffer allocate(long bytes, HostMemoryAllocator hostMemoryAllocator) { HostMemoryBuffer result = tryAllocate(bytes); if (result == null) { result = hostMemoryAllocator.allocate(bytes, false); @@ -241,6 +265,13 @@ public static HostMemoryBuffer allocate(long bytes, HostMemoryAllocator hostMem return result; } + /** + * Factory method to create a host buffer but preferably pointing to pinned memory. + * It is not guaranteed that the returned buffer will be pointer to pinned memory. + * + * @param bytes size in bytes to allocate + * @return newly created buffer + */ public static HostMemoryBuffer allocate(long bytes) { return allocate(bytes, DefaultHostMemoryAllocator.get()); } @@ -258,12 +289,24 @@ public static long getAvailableBytes() { return 0; } + /** + * Get the number of bytes that the pinned memory pool was allocated with. 
+ */ + public static long getTotalPoolSizeBytes() { + PinnedMemoryPool pool = getSingleton(); + if (pool != null) { + return pool.getTotalPoolSizeInternal(); + } + return 0; + } + private PinnedMemoryPool(long poolSize, int gpuId) { if (gpuId > -1) { // set the gpu device to use Cuda.setDevice(gpuId); Cuda.freeZero(); } + this.totalPoolSize = poolSize; this.pinnedPoolBase = Cuda.hostAllocPinned(poolSize); freeHeap.add(new MemorySection(pinnedPoolBase, poolSize)); this.availableBytes = poolSize; @@ -271,32 +314,42 @@ private PinnedMemoryPool(long poolSize, int gpuId) { @Override public void close() { - assert numAllocatedSections == 0; + assert numAllocatedSections == 0 : "Leaked " + numAllocatedSections + " pinned allocations"; Cuda.freePinned(pinnedPoolBase); } - private synchronized HostMemoryBuffer tryAllocateInternal(long bytes) { + /** + * Pads a length of bytes to the alignment the CPU wants in the worst case. This helps to + * calculate the size needed for a reservation if there are multiple buffers. + * @param bytes the size in bytes + * @return the new padded size in bytes. + */ + public static long padToCpuAlignment(long bytes) { + return ((bytes + ALIGNMENT - 1) / ALIGNMENT) * ALIGNMENT; + } + + private synchronized MemorySection tryGetInternal(long bytes, String what) { if (freeHeap.isEmpty()) { log.debug("No free pinned memory left"); return null; } // Align the allocation - long alignedBytes = ((bytes + ALIGNMENT - 1) / ALIGNMENT) * ALIGNMENT; + long alignedBytes = padToCpuAlignment(bytes); Optional firstFit = freeHeap.stream() - .filter(section -> section.size >= alignedBytes) - .findFirst(); + .filter(section -> section.size >= alignedBytes) + .findFirst(); if (!firstFit.isPresent()) { if (log.isDebugEnabled()) { MemorySection largest = freeHeap.stream() - .max(new SortedBySize()) - .orElse(new MemorySection(0, 0)); + .max(new SortedBySize()) + .orElse(new MemorySection(0, 0)); log.debug("Insufficient pinned memory. 
{} needed, {} found", alignedBytes, largest.size); } return null; } MemorySection first = firstFit.get(); - log.debug("Allocating {}/{} bytes pinned from {} FREE COUNT {} OUTSTANDING COUNT {}", - bytes, alignedBytes, first, freeHeap.size(), numAllocatedSections); + log.debug("{} {}/{} bytes pinned from {} FREE COUNT {} OUTSTANDING COUNT {}", + what, bytes, alignedBytes, first, freeHeap.size(), numAllocatedSections); freeHeap.remove(first); MemorySection allocated; if (first.size == alignedBytes) { @@ -307,9 +360,74 @@ private synchronized HostMemoryBuffer tryAllocateInternal(long bytes) { } numAllocatedSections++; availableBytes -= allocated.size; - log.debug("Allocated {} free {} outstanding {}", allocated, freeHeap, numAllocatedSections); - return new HostMemoryBuffer(allocated.baseAddress, bytes, - new PinnedHostBufferCleaner(allocated, bytes)); + log.debug("{} {} free {} outstanding {}", what, allocated, freeHeap, numAllocatedSections); + return allocated; + } + + private synchronized HostMemoryBuffer tryAllocateInternal(long bytes) { + MemorySection allocated = tryGetInternal(bytes, "allocate"); + if (allocated == null) { + return null; + } else { + return new HostMemoryBuffer(allocated.baseAddress, bytes, + new PinnedHostBufferCleaner(allocated, bytes)); + } + } + + private class PinnedReservation implements HostMemoryReservation { + private MemorySection section = null; + + public PinnedReservation(MemorySection section) { + this.section = section; + } + + @Override + public synchronized HostMemoryBuffer allocate(long bytes, boolean preferPinned) { + return this.allocate(bytes); + } + + @Override + public synchronized HostMemoryBuffer allocate(long bytes) { + if (section == null || section.size < bytes) { + throw new OutOfMemoryError("Reservation didn't have enough space " + bytes + " / " + + (section == null ? 0 : section.size)); + } + long alignedSize = padToCpuAlignment(bytes); + MemorySection allocated; + if (section.size >= bytes && section.size <= alignedSize) { + allocated = section; + section = null; + // No need for reserveAllocInternal because the original section is already tracked + } else { + allocated = section.splitOff(alignedSize); + PinnedMemoryPool.reserveAllocInternal(allocated); + } + return new HostMemoryBuffer(allocated.baseAddress, bytes, + new PinnedHostBufferCleaner(allocated, bytes)); + } + + @Override + public synchronized void close() throws Exception { + if (section != null) { + try { + PinnedMemoryPool.freeInternal(section); + } finally { + // Always mark the resource as freed even if an exception is thrown. + // We cannot know how far it progressed before the exception, and + // therefore it is unsafe to retry. 
+ section = null; + } + } + } + } + + private HostMemoryReservation tryReserveInternal(long bytes) { + MemorySection allocated = tryGetInternal(bytes, "allocate"); + if (allocated == null) { + return null; + } else { + return new PinnedReservation(allocated); + } } private synchronized void free(MemorySection section) { @@ -328,7 +446,17 @@ private synchronized void free(MemorySection section) { log.debug("After freeing {} outstanding {}", freeHeap, numAllocatedSections); } + private synchronized void reserveAllocHappened(MemorySection section) { + if (section != null && section.size > 0) { + numAllocatedSections++; + } + } + private synchronized long getAvailableBytesInternal() { return this.availableBytes; } + + private long getTotalPoolSizeInternal() { + return this.totalPoolSize; + } } From d1fb671128a55f965a7db907e99d5b1a841c2213 Mon Sep 17 00:00:00 2001 From: Lawrence Mitchell Date: Fri, 1 Sep 2023 16:16:47 +0100 Subject: [PATCH 138/230] Remove quadratic runtime due to accessing Frame._dtypes in loop (#14028) Frame._dtypes maps column names to dtypes, however, it is a property that is computed on-demand. Consequently, a seemingly innocuous dict lookup is actually O(N). When used in a loop over columns, this makes an O(N) loop into an O(N^2) one. This mostly bites on IO when reading data with many thousands of columns. To fix this, manually move access of Frame._dtypes outside of any loop over columns. A more systematic way might be to make this a cached property, but the cache invalidation is rather hard to reason about. - Closes https://github.com/rapidsai/cudf/issues/14005 Authors: - Lawrence Mitchell (https://github.com/wence-) Approvers: - https://github.com/brandon-b-miller URL: https://github.com/rapidsai/cudf/pull/14028 --- python/cudf/cudf/core/groupby/groupby.py | 3 ++- python/cudf/cudf/core/indexed_frame.py | 6 +----- python/cudf/cudf/io/csv.py | 7 ++++--- python/cudf/cudf/io/json.py | 7 ++++--- 4 files changed, 11 insertions(+), 12 deletions(-) diff --git a/python/cudf/cudf/core/groupby/groupby.py b/python/cudf/cudf/core/groupby/groupby.py index 38b07eca330..b300c55b537 100644 --- a/python/cudf/cudf/core/groupby/groupby.py +++ b/python/cudf/cudf/core/groupby/groupby.py @@ -313,9 +313,10 @@ def dtypes(self): 3 object int64 """ index = self.grouping.keys.unique().sort_values().to_pandas() + obj_dtypes = self.obj._dtypes return pd.DataFrame( { - name: [self.obj._dtypes[name]] * len(index) + name: [obj_dtypes[name]] * len(index) for name in self.grouping.values._column_names }, index=index, diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py index 4c6eb3a50e9..33ac97d7ef8 100644 --- a/python/cudf/cudf/core/indexed_frame.py +++ b/python/cudf/cudf/core/indexed_frame.py @@ -822,11 +822,7 @@ def replace( ) = _get_replacement_values_for_columns( to_replace=to_replace, value=value, - # TODO: This should be replaced with `DataFrame._dtypes` once - # that is moved up to `Frame`. - columns_dtype_map={ - col: self._data[col].dtype for col in self._data - }, + columns_dtype_map=self._dtypes, ) for name, col in self._data.items(): diff --git a/python/cudf/cudf/io/csv.py b/python/cudf/cudf/io/csv.py index 95e0aa18070..bacc0641639 100644 --- a/python/cudf/cudf/io/csv.py +++ b/python/cudf/cudf/io/csv.py @@ -123,11 +123,12 @@ def read_csv( if dtype is None or isinstance(dtype, abc.Mapping): # There exists some dtypes in the result columns that is inferred. # Find them and map them to the default dtypes. 
- dtype = {} if dtype is None else dtype + specified_dtypes = {} if dtype is None else dtype + df_dtypes = df._dtypes unspecified_dtypes = { - name: df._dtypes[name] + name: df_dtypes[name] for name in df._column_names - if name not in dtype + if name not in specified_dtypes } default_dtypes = {} diff --git a/python/cudf/cudf/io/json.py b/python/cudf/cudf/io/json.py index 4de9a92a068..efac24aee17 100644 --- a/python/cudf/cudf/io/json.py +++ b/python/cudf/cudf/io/json.py @@ -158,11 +158,12 @@ def read_json( if dtype is True or isinstance(dtype, abc.Mapping): # There exists some dtypes in the result columns that is inferred. # Find them and map them to the default dtypes. - dtype = {} if dtype is True else dtype + specified_dtypes = {} if dtype is True else dtype + df_dtypes = df._dtypes unspecified_dtypes = { - name: df._dtypes[name] + name: df_dtypes[name] for name in df._column_names - if name not in dtype + if name not in specified_dtypes } default_dtypes = {} From 2b7294b9afe413b8f6b956dc5148452ca0161e7f Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Fri, 1 Sep 2023 16:20:45 -0700 Subject: [PATCH 139/230] Expose streams in public replace APIs (#14010) Contributes to #925 Authors: - Vyas Ramasubramani (https://github.com/vyasr) Approvers: - Bradley Dice (https://github.com/bdice) - MithunR (https://github.com/mythrocks) - David Wendt (https://github.com/davidwendt) URL: https://github.com/rapidsai/cudf/pull/14010 --- cpp/include/cudf/replace.hpp | 25 ++++- cpp/src/replace/clamp.cu | 6 +- cpp/src/replace/nans.cu | 15 +-- cpp/src/replace/nulls.cu | 9 +- cpp/src/replace/replace.cu | 4 +- cpp/tests/CMakeLists.txt | 1 + cpp/tests/replace/replace_nulls_tests.cpp | 23 ++--- cpp/tests/replace/replace_tests.cpp | 34 +++---- cpp/tests/streams/replace_test.cpp | 109 ++++++++++++++++++++++ 9 files changed, 181 insertions(+), 45 deletions(-) create mode 100644 cpp/tests/streams/replace_test.cpp diff --git a/cpp/include/cudf/replace.hpp b/cpp/include/cudf/replace.hpp index 9df58306ace..3405dc8b796 100644 --- a/cpp/include/cudf/replace.hpp +++ b/cpp/include/cudf/replace.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2022, NVIDIA CORPORATION. + * Copyright (c) 2018-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -17,6 +17,7 @@ #pragma once #include +#include #include @@ -45,6 +46,7 @@ enum class replace_policy : bool { PRECEDING, FOLLOWING }; * * @param[in] input A column whose null values will be replaced * @param[in] replacement A cudf::column whose values will replace null values in input + * @param stream CUDA stream used for device memory operations and kernel launches * @param[in] mr Device memory resource used to allocate device memory of the returned column * * @returns A copy of `input` with the null values replaced with corresponding values from @@ -53,6 +55,7 @@ enum class replace_policy : bool { PRECEDING, FOLLOWING }; std::unique_ptr replace_nulls( column_view const& input, column_view const& replacement, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -63,6 +66,7 @@ std::unique_ptr replace_nulls( * * @param[in] input A column whose null values will be replaced * @param[in] replacement Scalar used to replace null values in `input` + * @param stream CUDA stream used for device memory operations and kernel launches * @param[in] mr Device memory resource used to allocate device memory of the returned column * * @returns Copy of `input` with null values replaced by `replacement` @@ -70,6 +74,7 @@ std::unique_ptr replace_nulls( std::unique_ptr replace_nulls( column_view const& input, scalar const& replacement, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -80,6 +85,7 @@ std::unique_ptr replace_nulls( * * @param[in] input A column whose null values will be replaced * @param[in] replace_policy Specify the position of replacement values relative to null values + * @param stream CUDA stream used for device memory operations and kernel launches * @param[in] mr Device memory resource used to allocate device memory of the returned column * * @returns Copy of `input` with null values replaced based on `replace_policy` @@ -87,6 +93,7 @@ std::unique_ptr replace_nulls( std::unique_ptr replace_nulls( column_view const& input, replace_policy const& replace_policy, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -106,6 +113,7 @@ std::unique_ptr replace_nulls( * * @param input A column whose NaN values will be replaced * @param replacement A cudf::column whose values will replace NaN values in input + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned column's device memory * @return A copy of `input` with the NaN values replaced with corresponding values from * `replacement`. 
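For context while reading these hunks (this sketch is not part of the patch): once the stream parameter shown above is exposed, a caller can run the replace APIs on its own stream instead of the default one. The `input` and `replacement` views and the helper name `replace_on_stream` below are illustrative assumptions, not code from this change.

```cpp
#include <cudf/column/column.hpp>
#include <cudf/column/column_view.hpp>
#include <cudf/replace.hpp>

#include <rmm/cuda_stream.hpp>

// Sketch: run replace_nulls on a caller-owned stream rather than the default stream.
std::unique_ptr<cudf::column> replace_on_stream(cudf::column_view const& input,
                                                cudf::column_view const& replacement)
{
  rmm::cuda_stream stream;  // assumption: the caller manages its own stream
  auto result = cudf::replace_nulls(input, replacement, stream.view());
  stream.synchronize();  // wait for the asynchronous work before using the result
  return result;
}
```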
@@ -113,6 +121,7 @@ std::unique_ptr replace_nulls( std::unique_ptr replace_nans( column_view const& input, column_view const& replacement, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -132,12 +141,14 @@ std::unique_ptr replace_nans( * * @param input A column whose NaN values will be replaced * @param replacement A cudf::scalar whose value will replace NaN values in input + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned column's device memory * @return A copy of `input` with the NaN values replaced by `replacement` */ std::unique_ptr replace_nans( column_view const& input, scalar const& replacement, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -147,6 +158,7 @@ std::unique_ptr replace_nans( * @param input_col The column to find and replace values in * @param values_to_replace The values to replace * @param replacement_values The values to replace with + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned column's device memory * * @returns Copy of `input_col` with specified values replaced @@ -155,6 +167,7 @@ std::unique_ptr find_and_replace_all( column_view const& input_col, column_view const& values_to_replace, column_view const& replacement_values, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -198,6 +211,7 @@ std::unique_ptr find_and_replace_all( * @param[in] hi Maximum clamp value. All elements greater than `hi` will be replaced by * `hi_replace`. Ignored if null. * @param[in] hi_replace All elements greater than `hi` will be replaced by `hi_replace` + * @param stream CUDA stream used for device memory operations and kernel launches * @param[in] mr Device memory resource used to allocate device memory of the returned column * * @return Returns a clamped column as per `lo` and `hi` boundaries @@ -208,6 +222,7 @@ std::unique_ptr clamp( scalar const& lo_replace, scalar const& hi, scalar const& hi_replace, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -244,6 +259,7 @@ std::unique_ptr clamp( * if null. * @param[in] hi Maximum clamp value. All elements greater than `hi` will be replaced by `hi` * Ignored if null. + * @param stream CUDA stream used for device memory operations and kernel launches * @param[in] mr Device memory resource used to allocate device memory of the returned column * * @return Returns a clamped column as per `lo` and `hi` boundaries @@ -252,6 +268,7 @@ std::unique_ptr clamp( column_view const& input, scalar const& lo, scalar const& hi, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -264,12 +281,14 @@ std::unique_ptr clamp( * * @throws cudf::logic_error if column does not have floating point data type. 
* @param[in] input column_view of floating-point elements to copy and normalize + * @param stream CUDA stream used for device memory operations and kernel launches * @param[in] mr device_memory_resource allocator for allocating output data * * @returns new column with the modified data */ std::unique_ptr normalize_nans_and_zeros( column_view const& input, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -282,8 +301,10 @@ std::unique_ptr normalize_nans_and_zeros( * * @throws cudf::logic_error if column does not have floating point data type. * @param[in, out] in_out of floating-point elements to normalize + * @param stream CUDA stream used for device memory operations and kernel launches */ -void normalize_nans_and_zeros(mutable_column_view& in_out); +void normalize_nans_and_zeros(mutable_column_view& in_out, + rmm::cuda_stream_view stream = cudf::get_default_stream()); /** @} */ // end of group } // namespace cudf diff --git a/cpp/src/replace/clamp.cu b/cpp/src/replace/clamp.cu index 68b496e0ab8..2b48aed2d29 100644 --- a/cpp/src/replace/clamp.cu +++ b/cpp/src/replace/clamp.cu @@ -386,19 +386,21 @@ std::unique_ptr clamp(column_view const& input, scalar const& lo_replace, scalar const& hi, scalar const& hi_replace, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::clamp(input, lo, lo_replace, hi, hi_replace, cudf::get_default_stream(), mr); + return detail::clamp(input, lo, lo_replace, hi, hi_replace, stream, mr); } // clamp input at lo and hi std::unique_ptr clamp(column_view const& input, scalar const& lo, scalar const& hi, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::clamp(input, lo, lo, hi, hi, cudf::get_default_stream(), mr); + return detail::clamp(input, lo, lo, hi, hi, stream, mr); } } // namespace cudf diff --git a/cpp/src/replace/nans.cu b/cpp/src/replace/nans.cu index ce0d2d07b36..2fcb934ba65 100644 --- a/cpp/src/replace/nans.cu +++ b/cpp/src/replace/nans.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2022, NVIDIA CORPORATION. + * Copyright (c) 2020-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -111,18 +111,20 @@ std::unique_ptr replace_nans(column_view const& input, std::unique_ptr replace_nans(column_view const& input, column_view const& replacement, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::replace_nans(input, replacement, cudf::get_default_stream(), mr); + return detail::replace_nans(input, replacement, stream, mr); } std::unique_ptr replace_nans(column_view const& input, scalar const& replacement, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::replace_nans(input, replacement, cudf::get_default_stream(), mr); + return detail::replace_nans(input, replacement, stream, mr); } } // namespace cudf @@ -202,7 +204,7 @@ std::unique_ptr normalize_nans_and_zeros(column_view const& input, // from device. unique_ptr which gets automatically cleaned up when we leave. 
auto out_view = out->mutable_view(); - normalize_nans_and_zeros(out_view, stream); + detail::normalize_nans_and_zeros(out_view, stream); out->set_null_count(input.null_count()); return out; @@ -221,10 +223,11 @@ std::unique_ptr normalize_nans_and_zeros(column_view const& input, * @param mr Device memory resource used to allocate the returned column's device memory. */ std::unique_ptr normalize_nans_and_zeros(column_view const& input, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::normalize_nans_and_zeros(input, cudf::get_default_stream(), mr); + return detail::normalize_nans_and_zeros(input, stream, mr); } /** @@ -237,7 +240,7 @@ std::unique_ptr normalize_nans_and_zeros(column_view const& input, * @throws cudf::logic_error if column does not have floating point data type. * @param[in, out] in_out mutable_column_view representing input data. data is processed in-place */ -void normalize_nans_and_zeros(mutable_column_view& in_out) +void normalize_nans_and_zeros(mutable_column_view& in_out, rmm::cuda_stream_view stream) { CUDF_FUNC_RANGE(); detail::normalize_nans_and_zeros(in_out, cudf::get_default_stream()); diff --git a/cpp/src/replace/nulls.cu b/cpp/src/replace/nulls.cu index 5b9fd3d9f0f..2eb624d3f05 100644 --- a/cpp/src/replace/nulls.cu +++ b/cpp/src/replace/nulls.cu @@ -446,26 +446,29 @@ std::unique_ptr replace_nulls(cudf::column_view const& input, std::unique_ptr replace_nulls(cudf::column_view const& input, cudf::column_view const& replacement, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::replace_nulls(input, replacement, cudf::get_default_stream(), mr); + return detail::replace_nulls(input, replacement, stream, mr); } std::unique_ptr replace_nulls(cudf::column_view const& input, cudf::scalar const& replacement, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::replace_nulls(input, replacement, cudf::get_default_stream(), mr); + return detail::replace_nulls(input, replacement, stream, mr); } std::unique_ptr replace_nulls(column_view const& input, replace_policy const& replace_policy, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::replace_nulls(input, replace_policy, cudf::get_default_stream(), mr); + return detail::replace_nulls(input, replace_policy, stream, mr); } } // namespace cudf diff --git a/cpp/src/replace/replace.cu b/cpp/src/replace/replace.cu index a7847bc0e7f..07eefdc27c6 100644 --- a/cpp/src/replace/replace.cu +++ b/cpp/src/replace/replace.cu @@ -527,9 +527,9 @@ std::unique_ptr find_and_replace_all(cudf::column_view const& inpu std::unique_ptr find_and_replace_all(cudf::column_view const& input_col, cudf::column_view const& values_to_replace, cudf::column_view const& replacement_values, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { - return detail::find_and_replace_all( - input_col, values_to_replace, replacement_values, cudf::get_default_stream(), mr); + return detail::find_and_replace_all(input_col, values_to_replace, replacement_values, stream, mr); } } // namespace cudf diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt index 8a0aa27b175..1bb1987198d 100644 --- a/cpp/tests/CMakeLists.txt +++ b/cpp/tests/CMakeLists.txt @@ -625,6 +625,7 @@ ConfigureTest(STREAM_COPYING_TEST streams/copying_test.cpp STREAM_MODE testing) ConfigureTest(STREAM_GROUPBY_TEST streams/groupby_test.cpp STREAM_MODE testing) 
ConfigureTest(STREAM_CONCATENATE_TEST streams/concatenate_test.cpp STREAM_MODE testing) ConfigureTest(STREAM_FILLING_TEST streams/filling_test.cpp STREAM_MODE testing) +ConfigureTest(STREAM_REPLACE_TEST streams/replace_test.cpp STREAM_MODE testing) # ################################################################################################## # Install tests #################################################################################### diff --git a/cpp/tests/replace/replace_nulls_tests.cpp b/cpp/tests/replace/replace_nulls_tests.cpp index 7e84a0695e3..a7c54145708 100644 --- a/cpp/tests/replace/replace_nulls_tests.cpp +++ b/cpp/tests/replace/replace_nulls_tests.cpp @@ -46,8 +46,7 @@ TEST_F(ReplaceErrorTest, SizeMismatch) {0, 0, 1, 1, 1, 1, 1, 1}}; cudf::test::fixed_width_column_wrapper values_to_replace_column{{10, 11, 12, 13}}; - ASSERT_THROW(cudf::replace_nulls(input_column, values_to_replace_column, mr()), - cudf::logic_error); + ASSERT_THROW(cudf::replace_nulls(input_column, values_to_replace_column), cudf::logic_error); } // Error: column type mismatch @@ -58,8 +57,7 @@ TEST_F(ReplaceErrorTest, TypeMismatch) cudf::test::fixed_width_column_wrapper values_to_replace_column{ {10, 11, 12, 13, 14, 15, 16, 17}}; - EXPECT_THROW(cudf::replace_nulls(input_column, values_to_replace_column, mr()), - cudf::logic_error); + EXPECT_THROW(cudf::replace_nulls(input_column, values_to_replace_column), cudf::logic_error); } // Error: column type mismatch @@ -69,7 +67,7 @@ TEST_F(ReplaceErrorTest, TypeMismatchScalar) {0, 0, 1, 1, 1, 1, 1, 1}}; cudf::numeric_scalar replacement(1); - EXPECT_THROW(cudf::replace_nulls(input_column, replacement, mr()), cudf::logic_error); + EXPECT_THROW(cudf::replace_nulls(input_column, replacement), cudf::logic_error); } struct ReplaceNullsStringsTest : public cudf::test::BaseFixture {}; @@ -88,7 +86,7 @@ TEST_F(ReplaceNullsStringsTest, SimpleReplace) replacement.begin(), replacement.end(), replacement_v.begin()}; std::unique_ptr result; - ASSERT_NO_THROW(result = cudf::replace_nulls(input_w, replacement_w, mr())); + ASSERT_NO_THROW(result = cudf::replace_nulls(input_w, replacement_w)); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*result, expected_w); } @@ -107,7 +105,7 @@ TEST_F(ReplaceNullsStringsTest, ReplaceWithNulls) replacement.begin(), replacement.end(), replacement_v.begin()}; std::unique_ptr result; - ASSERT_NO_THROW(result = cudf::replace_nulls(input_w, replacement_w, mr())); + ASSERT_NO_THROW(result = cudf::replace_nulls(input_w, replacement_w)); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*result, expected_w); } @@ -125,7 +123,7 @@ TEST_F(ReplaceNullsStringsTest, ReplaceWithAllNulls) cudf::test::strings_column_wrapper expected_w{input.begin(), input.end(), input_v.begin()}; std::unique_ptr result; - ASSERT_NO_THROW(result = cudf::replace_nulls(input_w, replacement_w, mr())); + ASSERT_NO_THROW(result = cudf::replace_nulls(input_w, replacement_w)); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*result, expected_w); } @@ -143,7 +141,7 @@ TEST_F(ReplaceNullsStringsTest, ReplaceWithAllEmpty) cudf::test::strings_column_wrapper expected_w{input.begin(), input.end(), replacement_v.begin()}; std::unique_ptr result; - ASSERT_NO_THROW(result = cudf::replace_nulls(input_w, replacement_w, mr())); + ASSERT_NO_THROW(result = cudf::replace_nulls(input_w, replacement_w)); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*result, expected_w); } @@ -161,7 +159,7 @@ TEST_F(ReplaceNullsStringsTest, ReplaceNone) cudf::test::strings_column_wrapper expected_w{input.begin(), input.end()}; std::unique_ptr result; - ASSERT_NO_THROW(result 
= cudf::replace_nulls(input_w, replacement_w, mr())); + ASSERT_NO_THROW(result = cudf::replace_nulls(input_w, replacement_w)); CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*result, expected_w); } @@ -170,8 +168,7 @@ TEST_F(ReplaceNullsStringsTest, SimpleReplaceScalar) { std::vector input{"", "", "", "", "", "", "", ""}; std::vector input_v{0, 0, 0, 0, 0, 0, 0, 0}; - std::unique_ptr repl = - cudf::make_string_scalar("rep", cudf::get_default_stream(), mr()); + std::unique_ptr repl = cudf::make_string_scalar("rep"); repl->set_valid_async(true, cudf::get_default_stream()); std::vector expected{"rep", "rep", "rep", "rep", "rep", "rep", "rep", "rep"}; @@ -179,7 +176,7 @@ TEST_F(ReplaceNullsStringsTest, SimpleReplaceScalar) cudf::test::strings_column_wrapper expected_w{expected.begin(), expected.end()}; std::unique_ptr result; - ASSERT_NO_THROW(result = cudf::replace_nulls(input_w, *repl, mr())); + ASSERT_NO_THROW(result = cudf::replace_nulls(input_w, *repl)); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*result, expected_w); } diff --git a/cpp/tests/replace/replace_tests.cpp b/cpp/tests/replace/replace_tests.cpp index 451cfa5bc9e..63460b0cb15 100644 --- a/cpp/tests/replace/replace_tests.cpp +++ b/cpp/tests/replace/replace_tests.cpp @@ -47,9 +47,9 @@ TEST_F(ReplaceErrorTest, SizeMismatch) cudf::test::fixed_width_column_wrapper values_to_replace_column{{10, 11, 12, 13}}; cudf::test::fixed_width_column_wrapper replacement_values_column{{15, 16, 17}}; - EXPECT_THROW(cudf::find_and_replace_all( - input_column, values_to_replace_column, replacement_values_column, mr()), - cudf::logic_error); + EXPECT_THROW( + cudf::find_and_replace_all(input_column, values_to_replace_column, replacement_values_column), + cudf::logic_error); } // Error: column type mismatch @@ -59,9 +59,9 @@ TEST_F(ReplaceErrorTest, TypeMismatch) cudf::test::fixed_width_column_wrapper values_to_replace_column{{10, 11, 12}}; cudf::test::fixed_width_column_wrapper replacement_values_column{{15, 16, 17}}; - EXPECT_THROW(cudf::find_and_replace_all( - input_column, values_to_replace_column, replacement_values_column, mr()), - cudf::logic_error); + EXPECT_THROW( + cudf::find_and_replace_all(input_column, values_to_replace_column, replacement_values_column), + cudf::logic_error); } // Error: nulls in old-values @@ -72,9 +72,9 @@ TEST_F(ReplaceErrorTest, NullInOldValues) {0, 1, 0, 1}}; cudf::test::fixed_width_column_wrapper replacement_values_column{{15, 16, 17, 18}}; - EXPECT_THROW(cudf::find_and_replace_all( - input_column, values_to_replace_column, replacement_values_column, mr()), - cudf::logic_error); + EXPECT_THROW( + cudf::find_and_replace_all(input_column, values_to_replace_column, replacement_values_column), + cudf::logic_error); } struct ReplaceStringsTest : public cudf::test::BaseFixture {}; @@ -93,7 +93,7 @@ TEST_F(ReplaceStringsTest, Strings) std::unique_ptr result; ASSERT_NO_THROW(result = cudf::find_and_replace_all( - input_wrapper, values_to_replace_wrapper, replacement_wrapper, mr())); + input_wrapper, values_to_replace_wrapper, replacement_wrapper)); std::vector expected{"z", "b", "c", "d", "e", "f", "g", "h"}; std::vector ex_valid{1, 1, 1, 1, 1, 1, 1, 1}; cudf::test::strings_column_wrapper expected_wrapper{ @@ -117,7 +117,7 @@ TEST_F(ReplaceStringsTest, StringsReplacementNulls) std::unique_ptr result; ASSERT_NO_THROW(result = cudf::find_and_replace_all( - input_wrapper, values_to_replace_wrapper, replacement_wrapper, mr())); + input_wrapper, values_to_replace_wrapper, replacement_wrapper)); std::vector expected{"z", "", "c", "d", "e", "f", "g", 
"h"}; std::vector ex_valid{1, 0, 1, 1, 1, 1, 1, 1}; cudf::test::strings_column_wrapper expected_wrapper{ @@ -143,7 +143,7 @@ TEST_F(ReplaceStringsTest, StringsResultAllNulls) std::unique_ptr result; ASSERT_NO_THROW(result = cudf::find_and_replace_all( - input_wrapper, values_to_replace_wrapper, replacement_wrapper, mr())); + input_wrapper, values_to_replace_wrapper, replacement_wrapper)); cudf::test::strings_column_wrapper expected_wrapper{ expected.begin(), expected.end(), ex_valid.begin()}; @@ -167,7 +167,7 @@ TEST_F(ReplaceStringsTest, StringsResultAllEmpty) std::unique_ptr result; ASSERT_NO_THROW(result = cudf::find_and_replace_all( - input_wrapper, values_to_replace_wrapper, replacement_wrapper, mr())); + input_wrapper, values_to_replace_wrapper, replacement_wrapper)); cudf::test::strings_column_wrapper expected_wrapper{ expected.begin(), expected.end(), ex_valid.begin()}; @@ -188,7 +188,7 @@ TEST_F(ReplaceStringsTest, StringsInputNulls) std::unique_ptr result; ASSERT_NO_THROW(result = cudf::find_and_replace_all( - input_wrapper, values_to_replace_wrapper, replacement_wrapper, mr())); + input_wrapper, values_to_replace_wrapper, replacement_wrapper)); std::vector expected{"z", "y", "", "", "e", "f", "g", "h"}; std::vector ex_valid{1, 1, 0, 0, 1, 1, 1, 1}; cudf::test::strings_column_wrapper expected_wrapper{ @@ -213,7 +213,7 @@ TEST_F(ReplaceStringsTest, StringsInputAndReplacementNulls) std::unique_ptr result; ASSERT_NO_THROW(result = cudf::find_and_replace_all( - input_wrapper, values_to_replace_wrapper, replacement_wrapper, mr())); + input_wrapper, values_to_replace_wrapper, replacement_wrapper)); std::vector expected{"z", "", "", "", "e", "f", "g", "h"}; std::vector ex_valid{1, 0, 0, 0, 1, 1, 1, 1}; cudf::test::strings_column_wrapper expected_wrapper{ @@ -236,7 +236,7 @@ TEST_F(ReplaceStringsTest, StringsEmptyReplacement) std::unique_ptr result; ASSERT_NO_THROW(result = cudf::find_and_replace_all( - input_wrapper, values_to_replace_wrapper, replacement_wrapper, mr())); + input_wrapper, values_to_replace_wrapper, replacement_wrapper)); std::vector expected{"a", "b", "", "", "e", "f", "g", "h"}; std::vector ex_valid{1, 1, 0, 0, 1, 1, 1, 1}; cudf::test::strings_column_wrapper expected_wrapper{ @@ -281,7 +281,7 @@ TEST_F(ReplaceStringsTest, StringsLargeScale) std::unique_ptr result; ASSERT_NO_THROW(result = cudf::find_and_replace_all( - input_wrapper, values_to_replace_wrapper, replacement_wrapper, mr())); + input_wrapper, values_to_replace_wrapper, replacement_wrapper)); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*result, expected_wrapper); } diff --git a/cpp/tests/streams/replace_test.cpp b/cpp/tests/streams/replace_test.cpp new file mode 100644 index 00000000000..c794f99b6f6 --- /dev/null +++ b/cpp/tests/streams/replace_test.cpp @@ -0,0 +1,109 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include +#include + +#include +#include +#include +#include + +class ReplaceTest : public cudf::test::BaseFixture {}; + +TEST_F(ReplaceTest, ReplaceNullsColumn) +{ + cudf::test::fixed_width_column_wrapper input({{0, 0, 0, 0, 0}, {0, 0, 1, 1, 1}}); + cudf::test::fixed_width_column_wrapper replacement({1, 1, 1, 1, 1}); + cudf::replace_nulls(input, replacement, cudf::test::get_default_stream()); +} + +TEST_F(ReplaceTest, ReplaceNullsScalar) +{ + cudf::test::fixed_width_column_wrapper input({{0, 0, 0, 0, 0}, {0, 0, 1, 1, 1}}); + auto replacement = cudf::numeric_scalar(1, true, cudf::test::get_default_stream()); + cudf::replace_nulls(input, replacement, cudf::test::get_default_stream()); +} + +TEST_F(ReplaceTest, ReplaceNullsPolicy) +{ + cudf::test::fixed_width_column_wrapper input({{0, 0, 0, 0, 0}, {0, 0, 1, 1, 1}}); + cudf::replace_nulls(input, cudf::replace_policy::FOLLOWING, cudf::test::get_default_stream()); +} + +TEST_F(ReplaceTest, ReplaceNansColumn) +{ + auto nan = std::numeric_limits::quiet_NaN(); + auto input_column = cudf::test::make_type_param_vector({0.0, 0.0, nan, nan, nan}); + cudf::test::fixed_width_column_wrapper input(input_column.begin(), input_column.end()); + cudf::test::fixed_width_column_wrapper replacement({0, 1, 2, 3, 4}); + cudf::replace_nans(input, replacement, cudf::test::get_default_stream()); +} + +TEST_F(ReplaceTest, ReplaceNansScalar) +{ + auto nan = std::numeric_limits::quiet_NaN(); + auto input_column = cudf::test::make_type_param_vector({0.0, 0.0, nan, nan, nan}); + cudf::test::fixed_width_column_wrapper input(input_column.begin(), input_column.end()); + auto replacement = cudf::numeric_scalar(4, true, cudf::test::get_default_stream()); + cudf::replace_nans(input, replacement, cudf::test::get_default_stream()); +} + +TEST_F(ReplaceTest, FindAndReplaceAll) +{ + cudf::test::fixed_width_column_wrapper input({0, 0, 0, 0, 0}); + cudf::test::fixed_width_column_wrapper values_to_replace({0, 0, 0, 0, 0}); + cudf::test::fixed_width_column_wrapper replacement_values({1, 1, 1, 1, 1}); + cudf::find_and_replace_all( + input, values_to_replace, replacement_values, cudf::test::get_default_stream()); +} + +TEST_F(ReplaceTest, ClampWithReplace) +{ + cudf::test::fixed_width_column_wrapper input({0, 1, 2, 3, 4, 5, 6, 7, 8, 9}); + auto low = cudf::numeric_scalar(3, true, cudf::test::get_default_stream()); + auto low_replace = cudf::numeric_scalar(5, true, cudf::test::get_default_stream()); + auto high = cudf::numeric_scalar(7, true, cudf::test::get_default_stream()); + auto high_replace = cudf::numeric_scalar(6, true, cudf::test::get_default_stream()); + cudf::clamp(input, low, low_replace, high, high_replace, cudf::test::get_default_stream()); +} + +TEST_F(ReplaceTest, Clamp) +{ + cudf::test::fixed_width_column_wrapper input({0, 1, 2, 3, 4, 5, 6, 7, 8, 9}); + auto low = cudf::numeric_scalar(3, true, cudf::test::get_default_stream()); + auto high = cudf::numeric_scalar(7, true, cudf::test::get_default_stream()); + cudf::clamp(input, low, high, cudf::test::get_default_stream()); +} + +TEST_F(ReplaceTest, NormalizeNansAndZeros) +{ + auto nan = std::numeric_limits::quiet_NaN(); + auto input_column = cudf::test::make_type_param_vector({-0.0, 0.0, -nan, nan, nan}); + cudf::test::fixed_width_column_wrapper input(input_column.begin(), input_column.end()); + cudf::normalize_nans_and_zeros(static_cast(input), + cudf::test::get_default_stream()); +} + +TEST_F(ReplaceTest, NormalizeNansAndZerosMutable) +{ + auto nan = std::numeric_limits::quiet_NaN(); + auto input_column 
= cudf::test::make_type_param_vector({-0.0, 0.0, -nan, nan, nan}); + cudf::test::fixed_width_column_wrapper input(input_column.begin(), input_column.end()); + cudf::normalize_nans_and_zeros(static_cast(input), + cudf::test::get_default_stream()); +} From bbbb143be086a85cc56f01157b5e94615f50c307 Mon Sep 17 00:00:00 2001 From: Bradley Dice Date: Fri, 1 Sep 2023 21:33:10 -0500 Subject: [PATCH 140/230] Use cudf::thread_index_type in concatenate.cu. (#13906) This PR uses `cudf::thread_index_type` in `concatenate.cu` to avoid risk of overflow. Authors: - Bradley Dice (https://github.com/bdice) Approvers: - Divye Gala (https://github.com/divyegala) - Yunsong Wang (https://github.com/PointKernel) - Vukasin Milovanovic (https://github.com/vuule) - Nghia Truong (https://github.com/ttnghia) URL: https://github.com/rapidsai/cudf/pull/13906 --- cpp/src/copying/concatenate.cu | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/cpp/src/copying/concatenate.cu b/cpp/src/copying/concatenate.cu index 35f06e47436..d08c3025553 100644 --- a/cpp/src/copying/concatenate.cu +++ b/cpp/src/copying/concatenate.cu @@ -118,13 +118,14 @@ __global__ void concatenate_masks_kernel(column_device_view const* views, size_type number_of_mask_bits, size_type* out_valid_count) { - size_type mask_index = threadIdx.x + blockIdx.x * blockDim.x; - - auto active_mask = __ballot_sync(0xFFFF'FFFFu, mask_index < number_of_mask_bits); + auto tidx = cudf::detail::grid_1d::global_thread_id(); + auto const stride = cudf::detail::grid_1d::grid_stride(); + auto active_mask = __ballot_sync(0xFFFF'FFFFu, tidx < number_of_mask_bits); size_type warp_valid_count = 0; - while (mask_index < number_of_mask_bits) { + while (tidx < number_of_mask_bits) { + auto const mask_index = static_cast(tidx); size_type const source_view_index = thrust::upper_bound( thrust::seq, output_offsets, output_offsets + number_of_views, mask_index) - @@ -141,8 +142,8 @@ __global__ void concatenate_masks_kernel(column_device_view const* views, warp_valid_count += __popc(new_word); } - mask_index += blockDim.x * gridDim.x; - active_mask = __ballot_sync(active_mask, mask_index < number_of_mask_bits); + tidx += stride; + active_mask = __ballot_sync(active_mask, tidx < number_of_mask_bits); } using detail::single_lane_block_sum_reduce; @@ -195,7 +196,8 @@ __global__ void fused_concatenate_kernel(column_device_view const* input_views, auto const output_size = output_view.size(); auto* output_data = output_view.data(); - int64_t output_index = threadIdx.x + blockIdx.x * blockDim.x; + auto output_index = cudf::detail::grid_1d::global_thread_id(); + auto const stride = cudf::detail::grid_1d::grid_stride(); size_type warp_valid_count = 0; unsigned active_mask; @@ -224,7 +226,7 @@ __global__ void fused_concatenate_kernel(column_device_view const* input_views, warp_valid_count += __popc(new_word); } - output_index += blockDim.x * gridDim.x; + output_index += stride; if (Nullable) { active_mask = __ballot_sync(active_mask, output_index < output_size); } } From 0c829cc0b868c288c3591771d555617d4d978ce3 Mon Sep 17 00:00:00 2001 From: Bradley Dice Date: Fri, 1 Sep 2023 21:38:11 -0500 Subject: [PATCH 141/230] Use cudf::thread_index_type in replace.cu. (#13905) This PR uses `cudf::thread_index_type` in `replace.cu` to avoid risk of overflow. 
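To illustrate the pattern behind these two changes, here is a small standalone sketch (not code taken from the patch) of a grid-stride loop written with a 64-bit index; the kernel name and parameters are made up for the example. With 32-bit `int` arithmetic, `blockIdx.x * blockDim.x` can exceed the `int` range on very large grids, which is the overflow risk a 64-bit `cudf::thread_index_type` removes.

```cpp
#include <cstdint>

// Stand-in for cudf::thread_index_type, a 64-bit signed integer.
using thread_index_type = std::int64_t;

// Hypothetical kernel showing the grid-stride idiom with 64-bit indexing.
__global__ void scale_kernel(float* data, thread_index_type size, float factor)
{
  auto tid          = static_cast<thread_index_type>(blockIdx.x) * blockDim.x + threadIdx.x;
  auto const stride = static_cast<thread_index_type>(blockDim.x) * gridDim.x;
  for (; tid < size; tid += stride) {
    data[tid] *= factor;  // each thread handles elements tid, tid + stride, ...
  }
}
```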
Authors: - Bradley Dice (https://github.com/bdice) Approvers: - Yunsong Wang (https://github.com/PointKernel) - Vukasin Milovanovic (https://github.com/vuule) - Nghia Truong (https://github.com/ttnghia) URL: https://github.com/rapidsai/cudf/pull/13905 --- cpp/src/replace/replace.cu | 91 ++++++++++++++++++++++---------------- 1 file changed, 53 insertions(+), 38 deletions(-) diff --git a/cpp/src/replace/replace.cu b/cpp/src/replace/replace.cu index 07eefdc27c6..9341929de44 100644 --- a/cpp/src/replace/replace.cu +++ b/cpp/src/replace/replace.cu @@ -127,40 +127,42 @@ __global__ void replace_strings_first_pass(cudf::column_device_view input, cudf::size_type* __restrict__ output_valid_count) { cudf::size_type nrows = input.size(); - cudf::size_type i = blockIdx.x * blockDim.x + threadIdx.x; + auto tid = cudf::detail::grid_1d::global_thread_id(); + auto const stride = cudf::detail::grid_1d::grid_stride(); uint32_t active_mask = 0xffff'ffffu; - active_mask = __ballot_sync(active_mask, i < nrows); + active_mask = __ballot_sync(active_mask, tid < nrows); auto const lane_id{threadIdx.x % cudf::detail::warp_size}; uint32_t valid_sum{0}; - while (i < nrows) { + while (tid < nrows) { + auto const idx = static_cast(tid); bool input_is_valid = true; - if (input_has_nulls) input_is_valid = input.is_valid_nocheck(i); + if (input_has_nulls) input_is_valid = input.is_valid_nocheck(idx); bool output_is_valid = input_is_valid; if (input_is_valid) { - int result = get_new_string_value(i, input, values_to_replace, replacement); - cudf::string_view output = (result == -1) ? input.element(i) + int result = get_new_string_value(idx, input, values_to_replace, replacement); + cudf::string_view output = (result == -1) ? input.element(idx) : replacement.element(result); - offsets.data()[i] = output.size_bytes(); - indices.data()[i] = result; + offsets.data()[idx] = output.size_bytes(); + indices.data()[idx] = result; if (replacement_has_nulls && result != -1) { output_is_valid = replacement.is_valid_nocheck(result); } } else { - offsets.data()[i] = 0; - indices.data()[i] = -1; + offsets.data()[idx] = 0; + indices.data()[idx] = -1; } uint32_t bitmask = __ballot_sync(active_mask, output_is_valid); if (0 == lane_id) { - output_valid[cudf::word_index(i)] = bitmask; + output_valid[cudf::word_index(idx)] = bitmask; valid_sum += __popc(bitmask); } - i += blockDim.x * gridDim.x; - active_mask = __ballot_sync(active_mask, i < nrows); + tid += stride; + active_mask = __ballot_sync(active_mask, tid < nrows); } // Compute total valid count for this block and add it to global count @@ -189,27 +191,32 @@ __global__ void replace_strings_second_pass(cudf::column_device_view input, cudf::mutable_column_device_view indices) { cudf::size_type nrows = input.size(); - cudf::size_type i = blockIdx.x * blockDim.x + threadIdx.x; + auto tid = cudf::detail::grid_1d::global_thread_id(); + auto const stride = cudf::detail::grid_1d::grid_stride(); - while (i < nrows) { - bool output_is_valid = true; - bool input_is_valid = true; - cudf::size_type idx = indices.element(i); + while (tid < nrows) { + auto const idx = static_cast(tid); + auto const replace_idx = indices.element(idx); + bool output_is_valid = true; + bool input_is_valid = true; if (input_has_nulls) { - input_is_valid = input.is_valid_nocheck(i); + input_is_valid = input.is_valid_nocheck(idx); output_is_valid = input_is_valid; } - if (replacement_has_nulls && idx != -1) { output_is_valid = replacement.is_valid_nocheck(idx); } + if (replacement_has_nulls && replace_idx != -1) { + 
output_is_valid = replacement.is_valid_nocheck(replace_idx); + } if (output_is_valid) { - cudf::string_view output = (idx == -1) ? input.element(i) - : replacement.element(idx); - std::memcpy(strings.data() + offsets.data()[i], + cudf::string_view output = (replace_idx == -1) + ? input.element(idx) + : replacement.element(replace_idx); + std::memcpy(strings.data() + offsets.data()[idx], output.data(), output.size_bytes()); } - i += blockDim.x * gridDim.x; + tid += stride; } } @@ -247,23 +254,25 @@ __global__ void replace_kernel(cudf::column_device_view input, { T* __restrict__ output_data = output.data(); - cudf::size_type i = blockIdx.x * blockDim.x + threadIdx.x; + auto tid = cudf::detail::grid_1d::global_thread_id(); + auto const stride = cudf::detail::grid_1d::grid_stride(); uint32_t active_mask = 0xffff'ffffu; - active_mask = __ballot_sync(active_mask, i < nrows); + active_mask = __ballot_sync(active_mask, tid < nrows); auto const lane_id{threadIdx.x % cudf::detail::warp_size}; uint32_t valid_sum{0}; - while (i < nrows) { + while (tid < nrows) { + auto const idx = static_cast(tid); bool output_is_valid{true}; bool input_is_valid{true}; if (input_has_nulls) { - input_is_valid = input.is_valid_nocheck(i); + input_is_valid = input.is_valid_nocheck(idx); output_is_valid = input_is_valid; } if (input_is_valid) - thrust::tie(output_data[i], output_is_valid) = get_new_value( - i, + thrust::tie(output_data[idx], output_is_valid) = get_new_value( + idx, input.data(), values_to_replace.data(), values_to_replace.data() + values_to_replace.size(), @@ -274,13 +283,13 @@ __global__ void replace_kernel(cudf::column_device_view input, if (input_has_nulls or replacement_has_nulls) { uint32_t bitmask = __ballot_sync(active_mask, output_is_valid); if (0 == lane_id) { - output.set_mask_word(cudf::word_index(i), bitmask); + output.set_mask_word(cudf::word_index(idx), bitmask); valid_sum += __popc(bitmask); } } - i += blockDim.x * gridDim.x; - active_mask = __ballot_sync(active_mask, i < nrows); + tid += stride; + active_mask = __ballot_sync(active_mask, tid < nrows); } if (input_has_nulls or replacement_has_nulls) { // Compute total valid count for this block and add it to global count @@ -384,10 +393,16 @@ std::unique_ptr replace_kernel_forwarder::operator() sizes = cudf::make_numeric_column( - cudf::data_type(cudf::type_id::INT32), input_col.size(), cudf::mask_state::UNALLOCATED, stream); - std::unique_ptr indices = cudf::make_numeric_column( - cudf::data_type(cudf::type_id::INT32), input_col.size(), cudf::mask_state::UNALLOCATED, stream); + std::unique_ptr sizes = + cudf::make_numeric_column(cudf::data_type{cudf::type_to_id()}, + input_col.size(), + cudf::mask_state::UNALLOCATED, + stream); + std::unique_ptr indices = + cudf::make_numeric_column(cudf::data_type{cudf::type_to_id()}, + input_col.size(), + cudf::mask_state::UNALLOCATED, + stream); auto sizes_view = sizes->mutable_view(); auto indices_view = indices->mutable_view(); @@ -413,7 +428,7 @@ std::unique_ptr replace_kernel_forwarder::operator()(), sizes_view.end(), stream, mr); + sizes_view.begin(), sizes_view.end(), stream, mr); auto offsets_view = offsets->mutable_view(); auto device_offsets = cudf::mutable_column_device_view::create(offsets_view, stream); From c51633627ee7087542ad4c315c0e139dea58e408 Mon Sep 17 00:00:00 2001 From: David Wendt <45795991+davidwendt@users.noreply.github.com> Date: Sun, 3 Sep 2023 02:20:33 -0400 Subject: [PATCH 142/230] Use cudf::make_empty_column instead of column_view constructor (#14030) Replaces places where 
the `cudf::column_view(type,size,...)` constructor was used to create an empty view with a call to `cudf::make_column_view(type)->view()`. This helps minimize the dependency on calling the constructors directly as part of the work needed for #13733 which may require an update to the `column_view` classes and its constructor(s). Most of the changes occur in strings gtests source files. No functionality or behavior has changed. Authors: - David Wendt (https://github.com/davidwendt) Approvers: - Vukasin Milovanovic (https://github.com/vuule) - Mike Wilson (https://github.com/hyperbolic2346) URL: https://github.com/rapidsai/cudf/pull/14030 --- cpp/src/io/json/write_json.cu | 2 +- cpp/tests/copying/gather_str_tests.cpp | 10 ++++------ cpp/tests/reshape/interleave_columns_tests.cpp | 2 +- cpp/tests/strings/array_tests.cpp | 18 +++++++++--------- cpp/tests/strings/attrs_tests.cpp | 4 ++-- cpp/tests/strings/booleans_tests.cpp | 9 ++++----- cpp/tests/strings/case_tests.cpp | 4 ++-- .../strings/combine/concatenate_tests.cpp | 12 +++++------- .../strings/combine/join_strings_tests.cpp | 4 ++-- cpp/tests/strings/concatenate_tests.cpp | 10 ++++------ cpp/tests/strings/datetime_tests.cpp | 8 +++----- cpp/tests/strings/durations_tests.cpp | 8 +++----- cpp/tests/strings/fill_tests.cpp | 3 +-- cpp/tests/strings/find_multiple_tests.cpp | 10 ++++------ cpp/tests/strings/find_tests.cpp | 7 +++---- cpp/tests/strings/integers_tests.cpp | 9 ++++----- cpp/tests/strings/ipv4_tests.cpp | 3 ++- cpp/tests/strings/pad_tests.cpp | 4 ++-- cpp/tests/strings/replace_tests.cpp | 4 ++-- cpp/tests/strings/reverse_tests.cpp | 4 ++-- cpp/tests/strings/slice_tests.cpp | 10 +++++----- cpp/tests/strings/split_tests.cpp | 8 ++++---- cpp/tests/strings/strip_tests.cpp | 4 ++-- cpp/tests/strings/translate_tests.cpp | 4 ++-- cpp/tests/strings/urls_tests.cpp | 8 ++++---- cpp/tests/text/ngrams_tests.cpp | 4 ++-- 26 files changed, 79 insertions(+), 94 deletions(-) diff --git a/cpp/src/io/json/write_json.cu b/cpp/src/io/json/write_json.cu index ffb4a7cd87b..1e44522ed33 100644 --- a/cpp/src/io/json/write_json.cu +++ b/cpp/src/io/json/write_json.cu @@ -582,7 +582,7 @@ struct column_to_strings_fn { return cudf::strings::detail::from_timestamps( column, format, - strings_column_view(column_view{data_type{type_id::STRING}, 0, nullptr, nullptr, 0}), + strings_column_view(make_empty_column(type_id::STRING)->view()), stream_, mr_); } diff --git a/cpp/tests/copying/gather_str_tests.cpp b/cpp/tests/copying/gather_str_tests.cpp index 41251b028ae..22af600ab96 100644 --- a/cpp/tests/copying/gather_str_tests.cpp +++ b/cpp/tests/copying/gather_str_tests.cpp @@ -133,10 +133,9 @@ TEST_F(GatherTestStr, GatherDontCheckOutOfBounds) TEST_F(GatherTestStr, GatherEmptyMapStringsColumn) { - cudf::column_view zero_size_strings_column( - cudf::data_type{cudf::type_id::STRING}, 0, nullptr, nullptr, 0); + auto const zero_size_strings_column = cudf::make_empty_column(cudf::type_id::STRING); cudf::test::fixed_width_column_wrapper gather_map; - auto results = cudf::detail::gather(cudf::table_view({zero_size_strings_column}), + auto results = cudf::detail::gather(cudf::table_view({zero_size_strings_column->view()}), gather_map, cudf::out_of_bounds_policy::NULLIFY, cudf::detail::negative_index_policy::NOT_ALLOWED, @@ -147,11 +146,10 @@ TEST_F(GatherTestStr, GatherEmptyMapStringsColumn) TEST_F(GatherTestStr, GatherZeroSizeStringsColumn) { - cudf::column_view zero_size_strings_column( - cudf::data_type{cudf::type_id::STRING}, 0, nullptr, nullptr, 0); + auto const 
zero_size_strings_column = cudf::make_empty_column(cudf::type_id::STRING); cudf::test::fixed_width_column_wrapper gather_map({0}); cudf::test::strings_column_wrapper expected{std::pair{"", false}}; - auto results = cudf::detail::gather(cudf::table_view({zero_size_strings_column}), + auto results = cudf::detail::gather(cudf::table_view({zero_size_strings_column->view()}), gather_map, cudf::out_of_bounds_policy::NULLIFY, cudf::detail::negative_index_policy::NOT_ALLOWED, diff --git a/cpp/tests/reshape/interleave_columns_tests.cpp b/cpp/tests/reshape/interleave_columns_tests.cpp index e2697567c38..eba6c961bbb 100644 --- a/cpp/tests/reshape/interleave_columns_tests.cpp +++ b/cpp/tests/reshape/interleave_columns_tests.cpp @@ -189,7 +189,7 @@ struct InterleaveStringsColumnsTest : public cudf::test::BaseFixture {}; TEST_F(InterleaveStringsColumnsTest, ZeroSizedColumns) { - cudf::column_view col0(cudf::data_type{cudf::type_id::STRING}, 0, nullptr, nullptr, 0); + auto const col0 = cudf::make_empty_column(cudf::type_id::STRING)->view(); auto results = cudf::interleave_columns(cudf::table_view{{col0}}); cudf::test::expect_column_empty(results->view()); diff --git a/cpp/tests/strings/array_tests.cpp b/cpp/tests/strings/array_tests.cpp index ecc38dfd26e..c7ceb899833 100644 --- a/cpp/tests/strings/array_tests.cpp +++ b/cpp/tests/strings/array_tests.cpp @@ -47,8 +47,8 @@ TEST_F(StringsColumnTest, Sort) TEST_F(StringsColumnTest, SortZeroSizeStringsColumn) { - cudf::column_view zero_size_strings_column( - cudf::data_type{cudf::type_id::STRING}, 0, nullptr, nullptr, 0); + auto const zero_size_strings_column = cudf::make_empty_column(cudf::type_id::STRING)->view(); + auto results = cudf::sort(cudf::table_view({zero_size_strings_column})); cudf::test::expect_column_empty(results->view().column(0)); } @@ -117,8 +117,8 @@ INSTANTIATE_TEST_CASE_P(StringsColumnTest, TEST_F(StringsColumnTest, SliceZeroSizeStringsColumn) { - cudf::column_view zero_size_strings_column( - cudf::data_type{cudf::type_id::STRING}, 0, nullptr, nullptr, 0); + auto const zero_size_strings_column = cudf::make_empty_column(cudf::type_id::STRING)->view(); + auto scol = cudf::slice(zero_size_strings_column, {0, 0}); auto results = std::make_unique(scol.front()); cudf::test::expect_column_empty(results->view()); @@ -141,8 +141,8 @@ TEST_F(StringsColumnTest, Gather) TEST_F(StringsColumnTest, GatherZeroSizeStringsColumn) { - cudf::column_view zero_size_strings_column( - cudf::data_type{cudf::type_id::STRING}, 0, nullptr, nullptr, 0); + auto const zero_size_strings_column = cudf::make_empty_column(cudf::type_id::STRING)->view(); + cudf::column_view map_view(cudf::data_type{cudf::type_id::INT32}, 0, nullptr, nullptr, 0); auto results = cudf::gather(cudf::table_view{{zero_size_strings_column}}, map_view)->release(); cudf::test::expect_column_empty(results.front()->view()); @@ -193,9 +193,9 @@ TEST_F(StringsColumnTest, ScatterScalar) TEST_F(StringsColumnTest, ScatterZeroSizeStringsColumn) { - cudf::column_view source(cudf::data_type{cudf::type_id::STRING}, 0, nullptr, nullptr, 0); - cudf::column_view target(cudf::data_type{cudf::type_id::STRING}, 0, nullptr, nullptr, 0); - cudf::column_view scatter_map(cudf::data_type{cudf::type_id::INT8}, 0, nullptr, nullptr, 0); + auto const source = cudf::make_empty_column(cudf::type_id::STRING)->view(); + auto const target = cudf::make_empty_column(cudf::type_id::STRING)->view(); + auto const scatter_map = cudf::make_empty_column(cudf::type_id::INT8)->view(); auto results = 
cudf::scatter(cudf::table_view({source}), scatter_map, cudf::table_view({target})); cudf::test::expect_column_empty(results->view().column(0)); diff --git a/cpp/tests/strings/attrs_tests.cpp b/cpp/tests/strings/attrs_tests.cpp index 4f2fc485388..c5f38697f00 100644 --- a/cpp/tests/strings/attrs_tests.cpp +++ b/cpp/tests/strings/attrs_tests.cpp @@ -48,8 +48,8 @@ TEST_F(StringsAttributesTest, CodePoints) TEST_F(StringsAttributesTest, ZeroSizeStringsColumn) { - cudf::column_view zero_size_strings_column( - cudf::data_type{cudf::type_id::STRING}, 0, nullptr, nullptr, 0); + auto const zero_size_strings_column = cudf::make_empty_column(cudf::type_id::STRING)->view(); + auto strings_view = cudf::strings_column_view(zero_size_strings_column); cudf::column_view expected_column(cudf::data_type{cudf::type_id::INT32}, 0, nullptr, nullptr, 0); diff --git a/cpp/tests/strings/booleans_tests.cpp b/cpp/tests/strings/booleans_tests.cpp index 21c9f6c70e8..0c7fc992065 100644 --- a/cpp/tests/strings/booleans_tests.cpp +++ b/cpp/tests/strings/booleans_tests.cpp @@ -66,16 +66,15 @@ TEST_F(StringsConvertTest, FromBooleans) TEST_F(StringsConvertTest, ZeroSizeStringsColumnBoolean) { - cudf::column_view zero_size_column(cudf::data_type{cudf::type_id::BOOL8}, 0, nullptr, nullptr, 0); - auto results = cudf::strings::from_booleans(zero_size_column); + auto const zero_size_column = cudf::make_empty_column(cudf::type_id::BOOL8)->view(); + auto results = cudf::strings::from_booleans(zero_size_column); cudf::test::expect_column_empty(results->view()); } TEST_F(StringsConvertTest, ZeroSizeBooleansColumn) { - cudf::column_view zero_size_column( - cudf::data_type{cudf::type_id::STRING}, 0, nullptr, nullptr, 0); - auto results = cudf::strings::to_booleans(zero_size_column); + auto const zero_size_strings_column = cudf::make_empty_column(cudf::type_id::STRING)->view(); + auto results = cudf::strings::to_booleans(zero_size_strings_column); EXPECT_EQ(0, results->size()); } diff --git a/cpp/tests/strings/case_tests.cpp b/cpp/tests/strings/case_tests.cpp index 5e2aa0584be..1d82d785ae8 100644 --- a/cpp/tests/strings/case_tests.cpp +++ b/cpp/tests/strings/case_tests.cpp @@ -262,8 +262,8 @@ TEST_F(StringsCaseTest, LongStrings) TEST_F(StringsCaseTest, EmptyStringsColumn) { - cudf::column_view zero_size_strings_column( - cudf::data_type{cudf::type_id::STRING}, 0, nullptr, nullptr, 0); + auto const zero_size_strings_column = cudf::make_empty_column(cudf::type_id::STRING)->view(); + auto strings_view = cudf::strings_column_view(zero_size_strings_column); auto results = cudf::strings::to_lower(strings_view); diff --git a/cpp/tests/strings/combine/concatenate_tests.cpp b/cpp/tests/strings/combine/concatenate_tests.cpp index 37cb7302a8e..95993e6ecbc 100644 --- a/cpp/tests/strings/combine/concatenate_tests.cpp +++ b/cpp/tests/strings/combine/concatenate_tests.cpp @@ -149,8 +149,7 @@ TEST_F(StringsCombineTest, ConcatenateSkipNulls) TEST_F(StringsCombineTest, ConcatZeroSizeStringsColumns) { - cudf::column_view zero_size_strings_column( - cudf::data_type{cudf::type_id::STRING}, 0, nullptr, nullptr, 0); + auto const zero_size_strings_column = cudf::make_empty_column(cudf::type_id::STRING)->view(); std::vector strings_columns; strings_columns.push_back(zero_size_strings_column); strings_columns.push_back(zero_size_strings_column); @@ -161,8 +160,8 @@ TEST_F(StringsCombineTest, ConcatZeroSizeStringsColumns) TEST_F(StringsCombineTest, SingleColumnErrorCheck) { - cudf::column_view col0(cudf::data_type{cudf::type_id::STRING}, 0, nullptr, nullptr, 0); - 
EXPECT_THROW(cudf::strings::concatenate(cudf::table_view{{col0}}), cudf::logic_error); + auto const col0 = cudf::make_empty_column(cudf::type_id::STRING); + EXPECT_THROW(cudf::strings::concatenate(cudf::table_view{{col0->view()}}), cudf::logic_error); } struct StringsConcatenateWithColSeparatorTest : public cudf::test::BaseFixture {}; @@ -180,7 +179,7 @@ TEST_F(StringsConcatenateWithColSeparatorTest, ExceptionTests) } { - cudf::column_view col0(cudf::data_type{cudf::type_id::STRING}, 0, nullptr, nullptr, 0); + auto const col0 = cudf::make_empty_column(cudf::type_id::STRING)->view(); cudf::test::fixed_width_column_wrapper col1{{1}}; EXPECT_THROW( @@ -200,8 +199,7 @@ TEST_F(StringsConcatenateWithColSeparatorTest, ExceptionTests) TEST_F(StringsConcatenateWithColSeparatorTest, ZeroSizedColumns) { - cudf::column_view col0(cudf::data_type{cudf::type_id::STRING}, 0, nullptr, nullptr, 0); - + auto const col0 = cudf::make_empty_column(cudf::type_id::STRING)->view(); auto results = cudf::strings::concatenate(cudf::table_view{{col0}}, cudf::strings_column_view(col0)); cudf::test::expect_column_empty(results->view()); diff --git a/cpp/tests/strings/combine/join_strings_tests.cpp b/cpp/tests/strings/combine/join_strings_tests.cpp index d413c50f122..ecc7432201f 100644 --- a/cpp/tests/strings/combine/join_strings_tests.cpp +++ b/cpp/tests/strings/combine/join_strings_tests.cpp @@ -73,8 +73,8 @@ TEST_F(JoinStringsTest, JoinLongStrings) TEST_F(JoinStringsTest, JoinZeroSizeStringsColumn) { - cudf::column_view zero_size_strings_column( - cudf::data_type{cudf::type_id::STRING}, 0, nullptr, nullptr, 0); + auto const zero_size_strings_column = cudf::make_empty_column(cudf::type_id::STRING)->view(); + auto strings_view = cudf::strings_column_view(zero_size_strings_column); auto results = cudf::strings::join_strings(strings_view); cudf::test::expect_column_empty(results->view()); diff --git a/cpp/tests/strings/concatenate_tests.cpp b/cpp/tests/strings/concatenate_tests.cpp index e40a90685c4..5cf4015b9e9 100644 --- a/cpp/tests/strings/concatenate_tests.cpp +++ b/cpp/tests/strings/concatenate_tests.cpp @@ -50,8 +50,8 @@ TEST_F(StringsConcatenateTest, Concatenate) cudf::test::strings_column_wrapper strings2(h_strings.data() + 6, h_strings.data() + 10); cudf::test::strings_column_wrapper strings3(h_strings.data() + 10, h_strings.data() + h_strings.size()); - cudf::column_view zero_size_strings_column( - cudf::data_type{cudf::type_id::STRING}, 0, nullptr, nullptr, 0); + + auto const zero_size_strings_column = cudf::make_empty_column(cudf::type_id::STRING)->view(); std::vector strings_columns; strings_columns.push_back(strings1); @@ -67,8 +67,7 @@ TEST_F(StringsConcatenateTest, Concatenate) TEST_F(StringsConcatenateTest, ZeroSizeStringsColumns) { - cudf::column_view zero_size_strings_column( - cudf::data_type{cudf::type_id::STRING}, 0, nullptr, nullptr, 0); + auto const zero_size_strings_column = cudf::make_empty_column(cudf::type_id::STRING)->view(); std::vector strings_columns; strings_columns.push_back(zero_size_strings_column); strings_columns.push_back(zero_size_strings_column); @@ -79,8 +78,7 @@ TEST_F(StringsConcatenateTest, ZeroSizeStringsColumns) TEST_F(StringsConcatenateTest, ZeroSizeStringsPlusNormal) { - cudf::column_view zero_size_strings_column( - cudf::data_type{cudf::type_id::STRING}, 0, nullptr, nullptr, 0); + auto const zero_size_strings_column = cudf::make_empty_column(cudf::type_id::STRING)->view(); std::vector strings_columns; strings_columns.push_back(zero_size_strings_column); diff --git 
a/cpp/tests/strings/datetime_tests.cpp b/cpp/tests/strings/datetime_tests.cpp index 8ad1858fa36..bb5c96a09bf 100644 --- a/cpp/tests/strings/datetime_tests.cpp +++ b/cpp/tests/strings/datetime_tests.cpp @@ -605,13 +605,11 @@ TEST_F(StringsDatetimeTest, FromTimestampAllSpecifiers) TEST_F(StringsDatetimeTest, ZeroSizeStringsColumn) { - cudf::column_view zero_size_column( - cudf::data_type{cudf::type_id::TIMESTAMP_SECONDS}, 0, nullptr, nullptr, 0); - auto results = cudf::strings::from_timestamps(zero_size_column); + auto const zero_size_column = cudf::make_empty_column(cudf::type_id::TIMESTAMP_SECONDS)->view(); + auto results = cudf::strings::from_timestamps(zero_size_column); cudf::test::expect_column_empty(results->view()); - cudf::column_view zero_size_strings_column( - cudf::data_type{cudf::type_id::STRING}, 0, nullptr, nullptr, 0); + auto const zero_size_strings_column = cudf::make_empty_column(cudf::type_id::STRING)->view(); results = cudf::strings::to_timestamps(cudf::strings_column_view(zero_size_strings_column), cudf::data_type{cudf::type_id::TIMESTAMP_SECONDS}, "%Y"); diff --git a/cpp/tests/strings/durations_tests.cpp b/cpp/tests/strings/durations_tests.cpp index f9026f5f624..0c7a1ad8042 100644 --- a/cpp/tests/strings/durations_tests.cpp +++ b/cpp/tests/strings/durations_tests.cpp @@ -728,13 +728,11 @@ TEST_F(StringsDurationsTest, ParseEscapeCharacters) TEST_F(StringsDurationsTest, ZeroSizeStringsColumn) { - cudf::column_view zero_size_column( - cudf::data_type{cudf::type_id::DURATION_SECONDS}, 0, nullptr, nullptr, 0); - auto results = cudf::strings::from_durations(zero_size_column); + auto const zero_size_column = cudf::make_empty_column(cudf::type_id::DURATION_SECONDS)->view(); + auto results = cudf::strings::from_durations(zero_size_column); cudf::test::expect_column_empty(results->view()); - cudf::column_view zero_size_strings_column( - cudf::data_type{cudf::type_id::STRING}, 0, nullptr, nullptr, 0); + auto const zero_size_strings_column = cudf::make_empty_column(cudf::type_id::STRING)->view(); results = cudf::strings::to_durations(cudf::strings_column_view(zero_size_strings_column), cudf::data_type{cudf::type_id::DURATION_SECONDS}, "%S"); diff --git a/cpp/tests/strings/fill_tests.cpp b/cpp/tests/strings/fill_tests.cpp index 74254b38d2f..aadd68402c8 100644 --- a/cpp/tests/strings/fill_tests.cpp +++ b/cpp/tests/strings/fill_tests.cpp @@ -69,8 +69,7 @@ TEST_F(StringsFillTest, Fill) TEST_F(StringsFillTest, ZeroSizeStringsColumns) { - cudf::column_view zero_size_strings_column( - cudf::data_type{cudf::type_id::STRING}, 0, nullptr, nullptr, 0); + auto const zero_size_strings_column = cudf::make_empty_column(cudf::type_id::STRING)->view(); auto results = cudf::fill(zero_size_strings_column, 0, 0, cudf::string_scalar("")); cudf::test::expect_column_empty(results->view()); } diff --git a/cpp/tests/strings/find_multiple_tests.cpp b/cpp/tests/strings/find_multiple_tests.cpp index 799bf9a3fcb..986f86d2b49 100644 --- a/cpp/tests/strings/find_multiple_tests.cpp +++ b/cpp/tests/strings/find_multiple_tests.cpp @@ -57,9 +57,8 @@ TEST_F(StringsFindMultipleTest, FindMultiple) TEST_F(StringsFindMultipleTest, ZeroSizeStringsColumn) { - cudf::column_view zero_size_strings_column( - cudf::data_type{cudf::type_id::STRING}, 0, nullptr, nullptr, 0); - auto strings_view = cudf::strings_column_view(zero_size_strings_column); + auto const zero_size_strings_column = cudf::make_empty_column(cudf::type_id::STRING)->view(); + auto strings_view = cudf::strings_column_view(zero_size_strings_column); std::vector 
h_targets{""}; cudf::test::strings_column_wrapper targets(h_targets.begin(), h_targets.end()); auto targets_view = cudf::strings_column_view(targets); @@ -73,9 +72,8 @@ TEST_F(StringsFindMultipleTest, ErrorTest) cudf::test::strings_column_wrapper strings({"this string intentionally left blank"}, {0}); auto strings_view = cudf::strings_column_view(strings); - cudf::column_view zero_size_strings_column( - cudf::data_type{cudf::type_id::STRING}, 0, nullptr, nullptr, 0); - auto empty_view = cudf::strings_column_view(zero_size_strings_column); + auto const zero_size_strings_column = cudf::make_empty_column(cudf::type_id::STRING)->view(); + auto empty_view = cudf::strings_column_view(zero_size_strings_column); // targets must have at least one string EXPECT_THROW(cudf::strings::find_multiple(strings_view, empty_view), cudf::logic_error); diff --git a/cpp/tests/strings/find_tests.cpp b/cpp/tests/strings/find_tests.cpp index e64a368a952..5c0a5b760f5 100644 --- a/cpp/tests/strings/find_tests.cpp +++ b/cpp/tests/strings/find_tests.cpp @@ -250,10 +250,9 @@ TEST_F(StringsFindTest, EndsWith) TEST_F(StringsFindTest, ZeroSizeStringsColumn) { - cudf::column_view zero_size_strings_column( - cudf::data_type{cudf::type_id::STRING}, 0, nullptr, nullptr, 0); - auto strings_view = cudf::strings_column_view(zero_size_strings_column); - auto results = cudf::strings::find(strings_view, cudf::string_scalar("é")); + auto const zero_size_strings_column = cudf::make_empty_column(cudf::type_id::STRING)->view(); + auto strings_view = cudf::strings_column_view(zero_size_strings_column); + auto results = cudf::strings::find(strings_view, cudf::string_scalar("é")); EXPECT_EQ(results->size(), 0); results = cudf::strings::rfind(strings_view, cudf::string_scalar("é")); EXPECT_EQ(results->size(), 0); diff --git a/cpp/tests/strings/integers_tests.cpp b/cpp/tests/strings/integers_tests.cpp index 7a44ca9efba..59805f9cb6d 100644 --- a/cpp/tests/strings/integers_tests.cpp +++ b/cpp/tests/strings/integers_tests.cpp @@ -261,17 +261,16 @@ TEST_F(StringsConvertTest, FromInteger) TEST_F(StringsConvertTest, ZeroSizeStringsColumn) { - cudf::column_view zero_size_column(cudf::data_type{cudf::type_id::INT32}, 0, nullptr, nullptr, 0); - auto results = cudf::strings::from_integers(zero_size_column); + auto const zero_size_column = cudf::make_empty_column(cudf::type_id::INT32)->view(); + auto results = cudf::strings::from_integers(zero_size_column); cudf::test::expect_column_empty(results->view()); } TEST_F(StringsConvertTest, ZeroSizeIntegersColumn) { - cudf::column_view zero_size_column( - cudf::data_type{cudf::type_id::STRING}, 0, nullptr, nullptr, 0); + auto const zero_size_strings_column = cudf::make_empty_column(cudf::type_id::STRING)->view(); auto results = - cudf::strings::to_integers(zero_size_column, cudf::data_type{cudf::type_id::INT32}); + cudf::strings::to_integers(zero_size_strings_column, cudf::data_type{cudf::type_id::INT32}); EXPECT_EQ(0, results->size()); } diff --git a/cpp/tests/strings/ipv4_tests.cpp b/cpp/tests/strings/ipv4_tests.cpp index 268806dd3cf..2b2d5730ca7 100644 --- a/cpp/tests/strings/ipv4_tests.cpp +++ b/cpp/tests/strings/ipv4_tests.cpp @@ -72,7 +72,8 @@ TEST_F(StringsConvertTest, IntegersToIPv4) TEST_F(StringsConvertTest, ZeroSizeStringsColumnIPV4) { - cudf::column_view zero_size_column(cudf::data_type{cudf::type_id::INT64}, 0, nullptr, nullptr, 0); + auto const zero_size_column = cudf::make_empty_column(cudf::type_id::INT64)->view(); + auto results = cudf::strings::integers_to_ipv4(zero_size_column); 
cudf::test::expect_column_empty(results->view()); results = cudf::strings::ipv4_to_integers(results->view()); diff --git a/cpp/tests/strings/pad_tests.cpp b/cpp/tests/strings/pad_tests.cpp index 8c07cb62c6b..81ec87a12a8 100644 --- a/cpp/tests/strings/pad_tests.cpp +++ b/cpp/tests/strings/pad_tests.cpp @@ -97,8 +97,8 @@ TEST_F(StringsPadTest, PaddingBoth) TEST_F(StringsPadTest, ZeroSizeStringsColumn) { - cudf::column_view zero_size_strings_column( - cudf::data_type{cudf::type_id::STRING}, 0, nullptr, nullptr, 0); + auto const zero_size_strings_column = cudf::make_empty_column(cudf::type_id::STRING)->view(); + auto strings_view = cudf::strings_column_view(zero_size_strings_column); auto results = cudf::strings::pad(strings_view, 5); cudf::test::expect_column_empty(results->view()); diff --git a/cpp/tests/strings/replace_tests.cpp b/cpp/tests/strings/replace_tests.cpp index 4a45773a29a..f143983aded 100644 --- a/cpp/tests/strings/replace_tests.cpp +++ b/cpp/tests/strings/replace_tests.cpp @@ -470,8 +470,8 @@ TEST_F(StringsReplaceTest, ReplaceMultiLong) TEST_F(StringsReplaceTest, EmptyStringsColumn) { - cudf::column_view zero_size_strings_column( - cudf::data_type{cudf::type_id::STRING}, 0, nullptr, nullptr, 0); + auto const zero_size_strings_column = cudf::make_empty_column(cudf::type_id::STRING)->view(); + auto strings_view = cudf::strings_column_view(zero_size_strings_column); auto results = cudf::strings::replace( strings_view, cudf::string_scalar("not"), cudf::string_scalar("pertinent")); diff --git a/cpp/tests/strings/reverse_tests.cpp b/cpp/tests/strings/reverse_tests.cpp index 8c3f87709ff..3df42b61ebf 100644 --- a/cpp/tests/strings/reverse_tests.cpp +++ b/cpp/tests/strings/reverse_tests.cpp @@ -45,8 +45,8 @@ TEST_F(StringsReverseTest, Reverse) TEST_F(StringsReverseTest, EmptyStringsColumn) { - cudf::column_view zero_size_strings_column( - cudf::data_type{cudf::type_id::STRING}, 0, nullptr, nullptr, 0); + auto const zero_size_strings_column = cudf::make_empty_column(cudf::type_id::STRING)->view(); + auto results = cudf::strings::reverse(cudf::strings_column_view(zero_size_strings_column)); auto view = results->view(); cudf::test::expect_column_empty(results->view()); diff --git a/cpp/tests/strings/slice_tests.cpp b/cpp/tests/strings/slice_tests.cpp index 1162bbb6b13..92230d06672 100644 --- a/cpp/tests/strings/slice_tests.cpp +++ b/cpp/tests/strings/slice_tests.cpp @@ -288,15 +288,15 @@ TEST_F(StringsSliceTest, Error) TEST_F(StringsSliceTest, ZeroSizeStringsColumn) { - cudf::column_view zero_size_strings_column( - cudf::data_type{cudf::type_id::STRING}, 0, nullptr, nullptr, 0); - auto strings_view = cudf::strings_column_view(zero_size_strings_column); + auto const zero_size_strings_column = cudf::make_empty_column(cudf::type_id::STRING)->view(); + auto strings_view = cudf::strings_column_view(zero_size_strings_column); auto results = cudf::strings::slice_strings(strings_view, 1, 2); cudf::test::expect_column_empty(results->view()); - cudf::column_view starts_column(cudf::data_type{cudf::type_id::INT32}, 0, nullptr, nullptr, 0); - cudf::column_view stops_column(cudf::data_type{cudf::type_id::INT32}, 0, nullptr, nullptr, 0); + auto const starts_column = cudf::make_empty_column(cudf::type_id::INT32)->view(); + auto const stops_column = cudf::make_empty_column(cudf::type_id::INT32)->view(); + results = cudf::strings::slice_strings(strings_view, starts_column, stops_column); cudf::test::expect_column_empty(results->view()); } diff --git a/cpp/tests/strings/split_tests.cpp 
b/cpp/tests/strings/split_tests.cpp index e8c4ec8e19c..445e283ef45 100644 --- a/cpp/tests/strings/split_tests.cpp +++ b/cpp/tests/strings/split_tests.cpp @@ -676,8 +676,8 @@ TEST_F(StringsSplitTest, RSplitRegexWithMaxSplit) TEST_F(StringsSplitTest, SplitZeroSizeStringsColumns) { - cudf::column_view zero_size_strings_column( - cudf::data_type{cudf::type_id::STRING}, 0, nullptr, nullptr, 0); + auto const zero_size_strings_column = cudf::make_empty_column(cudf::type_id::STRING)->view(); + auto prog = cudf::strings::regex_program::create("\\s"); auto results = cudf::strings::split(zero_size_strings_column); EXPECT_TRUE(results->num_columns() == 1); @@ -912,8 +912,8 @@ TEST_F(StringsSplitTest, RPartitionWhitespace) TEST_F(StringsSplitTest, PartitionZeroSizeStringsColumns) { - cudf::column_view zero_size_strings_column( - cudf::data_type{cudf::type_id::STRING}, 0, nullptr, nullptr, 0); + auto const zero_size_strings_column = cudf::make_empty_column(cudf::type_id::STRING)->view(); + auto results = cudf::strings::partition(zero_size_strings_column); EXPECT_TRUE(results->num_columns() == 0); results = cudf::strings::rpartition(zero_size_strings_column); diff --git a/cpp/tests/strings/strip_tests.cpp b/cpp/tests/strings/strip_tests.cpp index bd6d587e0a1..63179474944 100644 --- a/cpp/tests/strings/strip_tests.cpp +++ b/cpp/tests/strings/strip_tests.cpp @@ -92,8 +92,8 @@ TEST_F(StringsStripTest, StripBoth) TEST_F(StringsStripTest, EmptyStringsColumn) { - cudf::column_view zero_size_strings_column( - cudf::data_type{cudf::type_id::STRING}, 0, nullptr, nullptr, 0); + auto const zero_size_strings_column = cudf::make_empty_column(cudf::type_id::STRING)->view(); + auto strings_view = cudf::strings_column_view(zero_size_strings_column); auto results = cudf::strings::strip(strings_view); auto view = results->view(); diff --git a/cpp/tests/strings/translate_tests.cpp b/cpp/tests/strings/translate_tests.cpp index 6b4288196f9..ab3973242c6 100644 --- a/cpp/tests/strings/translate_tests.cpp +++ b/cpp/tests/strings/translate_tests.cpp @@ -62,8 +62,8 @@ TEST_F(StringsTranslateTest, Translate) TEST_F(StringsTranslateTest, ZeroSizeStringsColumn) { - cudf::column_view zero_size_strings_column( - cudf::data_type{cudf::type_id::STRING}, 0, nullptr, nullptr, 0); + auto const zero_size_strings_column = cudf::make_empty_column(cudf::type_id::STRING)->view(); + auto strings_view = cudf::strings_column_view(zero_size_strings_column); std::vector> translate_table; auto results = cudf::strings::translate(strings_view, translate_table); diff --git a/cpp/tests/strings/urls_tests.cpp b/cpp/tests/strings/urls_tests.cpp index 22147d33569..2aec72160cc 100644 --- a/cpp/tests/strings/urls_tests.cpp +++ b/cpp/tests/strings/urls_tests.cpp @@ -226,10 +226,10 @@ TEST_F(StringsConvertTest, UrlDecodeLargeStrings) TEST_F(StringsConvertTest, ZeroSizeUrlStringsColumn) { - cudf::column_view zero_size_column( - cudf::data_type{cudf::type_id::STRING}, 0, nullptr, nullptr, 0); - auto results = cudf::strings::url_encode(zero_size_column); + auto const zero_size_strings_column = cudf::make_empty_column(cudf::type_id::STRING)->view(); + + auto results = cudf::strings::url_encode(zero_size_strings_column); cudf::test::expect_column_empty(results->view()); - results = cudf::strings::url_decode(zero_size_column); + results = cudf::strings::url_decode(zero_size_strings_column); cudf::test::expect_column_empty(results->view()); } diff --git a/cpp/tests/text/ngrams_tests.cpp b/cpp/tests/text/ngrams_tests.cpp index feb0cf538b0..323b3eed3e2 100644 --- 
a/cpp/tests/text/ngrams_tests.cpp +++ b/cpp/tests/text/ngrams_tests.cpp @@ -101,8 +101,8 @@ TEST_F(TextGenerateNgramsTest, NgramsWithNulls) TEST_F(TextGenerateNgramsTest, Empty) { - cudf::column_view zero_size_strings_column( - cudf::data_type{cudf::type_id::STRING}, 0, nullptr, nullptr, 0); + auto const zero_size_strings_column = cudf::make_empty_column(cudf::type_id::STRING)->view(); + auto results = nvtext::generate_ngrams(cudf::strings_column_view(zero_size_strings_column)); cudf::test::expect_column_empty(results->view()); results = nvtext::generate_character_ngrams(cudf::strings_column_view(zero_size_strings_column)); From 3e5f019697252f6c300639a09eb67ff11a80ac43 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Mon, 4 Sep 2023 07:11:13 -1000 Subject: [PATCH 143/230] Raise NotImplementedError for Categoricals with timezones (#14032) Currently `cudf.from_pandas` with a pandas Categorical with datetimetz type will drop the timezone information (due to pyarrow) ```python In [5]: import pandas as pd In [6]: ci = pd.CategoricalIndex(pd.date_range("2016-01-01 01:01:00", periods=5, freq="D").tz_localize("UTC")) In [7]: ci Out[7]: CategoricalIndex(['2016-01-01 01:01:00+00:00', '2016-01-02 01:01:00+00:00', '2016-01-03 01:01:00+00:00', '2016-01-04 01:01:00+00:00', '2016-01-05 01:01:00+00:00'], categories=[2016-01-01 01:01:00+00:00, 2016-01-02 01:01:00+00:00, 2016-01-03 01:01:00+00:00, 2016-01-04 01:01:00+00:00, 2016-01-05 01:01:00+00:00], ordered=False, dtype='category') In [8]: ci_cudf = cudf.from_pandas(ci) In [10]: ci_cudf Out[10]: CategoricalIndex(['2016-01-01 01:01:00', '2016-01-02 01:01:00', '2016-01-03 01:01:00', '2016-01-04 01:01:00', '2016-01-05 01:01:00'], categories=[2016-01-01 01:01:00, 2016-01-02 01:01:00, 2016-01-03 01:01:00, 2016-01-04 01:01:00, 2016-01-05 01:01:00], ordered=False, dtype='category') ``` Like what is done with `IntervalIndex`, raises a `NotImplementedError` for now to avoid this wrong behavior. Authors: - Matthew Roeschke (https://github.com/mroeschke) Approvers: - Lawrence Mitchell (https://github.com/wence-) URL: https://github.com/rapidsai/cudf/pull/14032 --- python/cudf/cudf/core/column/column.py | 31 ++++++++++++++++++++----- python/cudf/cudf/tests/test_datetime.py | 2 ++ python/cudf/cudf/tests/test_interval.py | 11 +++++---- 3 files changed, 33 insertions(+), 11 deletions(-) diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index ad761ea8d18..9dde17a1045 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -2033,9 +2033,19 @@ def as_column( f"{arbitrary.dtype} is not supported. Convert first to " f"{arbitrary.dtype.subtype}." 
) - if is_categorical_dtype(arbitrary): + if is_categorical_dtype(arbitrary.dtype): + if isinstance( + arbitrary.dtype.categories.dtype, pd.DatetimeTZDtype + ): + raise NotImplementedError( + "cuDF does not yet support timezone-aware datetimes" + ) data = as_column(pa.array(arbitrary, from_pandas=True)) elif is_interval_dtype(arbitrary.dtype): + if isinstance(arbitrary.dtype.subtype, pd.DatetimeTZDtype): + raise NotImplementedError( + "cuDF does not yet support timezone-aware datetimes" + ) data = as_column(pa.array(arbitrary, from_pandas=True)) elif arbitrary.dtype == np.bool_: data = as_column(cupy.asarray(arbitrary), dtype=arbitrary.dtype) @@ -2262,11 +2272,20 @@ def as_column( elif isinstance(arbitrary, pd.core.arrays.masked.BaseMaskedArray): data = as_column(pa.Array.from_pandas(arbitrary), dtype=dtype) elif ( - isinstance(arbitrary, pd.DatetimeIndex) - and isinstance(arbitrary.dtype, pd.DatetimeTZDtype) - ) or ( - isinstance(arbitrary, pd.IntervalIndex) - and is_datetime64tz_dtype(arbitrary.dtype.subtype) + ( + isinstance(arbitrary, pd.DatetimeIndex) + and isinstance(arbitrary.dtype, pd.DatetimeTZDtype) + ) + or ( + isinstance(arbitrary, pd.IntervalIndex) + and is_datetime64tz_dtype(arbitrary.dtype.subtype) + ) + or ( + isinstance(arbitrary, pd.CategoricalIndex) + and isinstance( + arbitrary.dtype.categories.dtype, pd.DatetimeTZDtype + ) + ) ): raise NotImplementedError( "cuDF does not yet support timezone-aware datetimes" diff --git a/python/cudf/cudf/tests/test_datetime.py b/python/cudf/cudf/tests/test_datetime.py index abcc057f823..b1685950241 100644 --- a/python/cudf/cudf/tests/test_datetime.py +++ b/python/cudf/cudf/tests/test_datetime.py @@ -2095,6 +2095,8 @@ def test_construction_from_tz_timestamps(data): _ = cudf.Index(data) with pytest.raises(NotImplementedError): _ = cudf.DatetimeIndex(data) + with pytest.raises(NotImplementedError): + cudf.CategoricalIndex(data) @pytest.mark.parametrize("op", _cmpops) diff --git a/python/cudf/cudf/tests/test_interval.py b/python/cudf/cudf/tests/test_interval.py index 9704be44b95..a27de60c2c5 100644 --- a/python/cudf/cudf/tests/test_interval.py +++ b/python/cudf/cudf/tests/test_interval.py @@ -167,17 +167,18 @@ def test_interval_index_unique(): assert_eq(expected, actual) +@pytest.mark.parametrize("box", [pd.Series, pd.IntervalIndex]) @pytest.mark.parametrize("tz", ["US/Eastern", None]) -def test_interval_with_datetime(tz): +def test_interval_with_datetime(tz, box): dti = pd.date_range( start=pd.Timestamp("20180101", tz=tz), end=pd.Timestamp("20181231", tz=tz), freq="M", ) - pidx = pd.IntervalIndex.from_breaks(dti) + pobj = box(pd.IntervalIndex.from_breaks(dti)) if tz is None: - gidx = cudf.from_pandas(pidx) - assert_eq(pidx, gidx) + gobj = cudf.from_pandas(pobj) + assert_eq(pobj, gobj) else: with pytest.raises(NotImplementedError): - cudf.from_pandas(pidx) + cudf.from_pandas(pobj) From 0b01fe49c8d5963e7be07e6dac2b78f842461db3 Mon Sep 17 00:00:00 2001 From: Lawrence Mitchell Date: Tue, 5 Sep 2023 21:22:33 +0100 Subject: [PATCH 144/230] Implement `sort_remaining` for `sort_index` (#14033) Previously, the `sort_remaining` argument to `sort_index` was ignored. Passing `sort_remaining=False` would raise a `NotImplementedError`. Moreover, for a multiindex, `sort_remaining=True` was not handled correctly: if not all levels were requested as sorted, `sort_index` would behave as if `sort_remaining=False` had been passed. 
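As a minimal sketch of the intended semantics (the toy data and level names here are hypothetical, not taken from this PR's tests):

```python
import cudf

df = cudf.DataFrame(
    {"val": [0, 1, 2, 3]},
    index=cudf.MultiIndex.from_tuples(
        [(2, "b"), (1, "a"), (2, "a"), (1, "b")], names=["x", "y"]
    ),
)

# Sorting only on level "x" with sort_remaining=True (the default) should also
# lexsort the remaining level "y" within each "x" group, yielding the row order
# (1, "a"), (1, "b"), (2, "a"), (2, "b"); sort_remaining=False should leave "y"
# in its original relative order within each group.
print(df.sort_index(level="x", sort_remaining=True))
```
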
To fix this case, construct the sort order based on first the provided levels and, if `sort_remaining=True`, the left-over levels (in index order). To facilitate this, refactor the internal `_get_columns_by_label` function to always return a `Frame`-like object (previously, if we had a `Frame` we would get back a `ColumnAccessor`, and it was only for `IndexedFrame` and above that we'd get something of `Self`-like type back). This meant that calling `_get_sorted_inds` with `by != None` was not possible on an `Index` or `MultiIndex` (the code assumed we'd get a `Frame` back). - Closes #14011 Authors: - Lawrence Mitchell (https://github.com/wence-) Approvers: - GALI PREM SAGAR (https://github.com/galipremsagar) - Matthew Roeschke (https://github.com/mroeschke) - Vyas Ramasubramani (https://github.com/vyasr) URL: https://github.com/rapidsai/cudf/pull/14033 --- python/cudf/cudf/core/dataframe.py | 12 +++++---- python/cudf/cudf/core/frame.py | 4 +-- python/cudf/cudf/core/indexed_frame.py | 31 +++++++++++++---------- python/cudf/cudf/core/series.py | 8 +++--- python/cudf/cudf/tests/test_multiindex.py | 23 +++++++++++++++++ 5 files changed, 53 insertions(+), 25 deletions(-) diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index e67604069f1..5a3d25a08a7 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -36,7 +36,7 @@ from pandas.core.dtypes.common import is_float, is_integer from pandas.io.formats import console from pandas.io.formats.printing import pprint_thing -from typing_extensions import assert_never +from typing_extensions import Self, assert_never import cudf import cudf.core.common @@ -1830,13 +1830,15 @@ def _repr_latex_(self): return self._get_renderable_dataframe().to_pandas()._repr_latex_() @_cudf_nvtx_annotate - def _get_columns_by_label(self, labels, downcast=False): + def _get_columns_by_label( + self, labels, *, downcast=False + ) -> Self | Series: """ Return columns of dataframe by `labels` If downcast is True, try and downcast from a DataFrame to a Series """ - new_data = super()._get_columns_by_label(labels, downcast) + ca = self._data.select_by_label(labels) if downcast: if is_scalar(labels): nlevels = 1 @@ -1844,11 +1846,11 @@ def _get_columns_by_label(self, labels, downcast=False): nlevels = len(labels) if self._data.multiindex is False or nlevels == self._data.nlevels: out = self._constructor_sliced._from_data( - new_data, index=self.index, name=labels + ca, index=self.index, name=labels ) return out out = self.__class__._from_data( - new_data, index=self.index, columns=new_data.to_pandas_index() + ca, index=self.index, columns=ca.to_pandas_index() ) return out diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index b9f052e7626..6224793d6f1 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -362,12 +362,12 @@ def equals(self, other): ) @_cudf_nvtx_annotate - def _get_columns_by_label(self, labels, downcast=False): + def _get_columns_by_label(self, labels, *, downcast=False) -> Self: """ Returns columns of the Frame specified by `labels` """ - return self._data.select_by_label(labels) + return self.__class__._from_data(self._data.select_by_label(labels)) @property @_cudf_nvtx_annotate diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py index 33ac97d7ef8..69b25c51a66 100644 --- a/python/cudf/cudf/core/indexed_frame.py +++ b/python/cudf/cudf/core/indexed_frame.py @@ -1526,7 +1526,9 @@ def sort_index( 
na_position : {'first', 'last'}, default 'last' Puts NaNs at the beginning if first; last puts NaNs at the end. sort_remaining : bool, default True - Not yet supported + When sorting a multiindex on a subset of its levels, + should entries be lexsorted by the remaining + (non-specified) levels as well? ignore_index : bool, default False if True, index will be replaced with RangeIndex. key : callable, optional @@ -1592,11 +1594,6 @@ def sort_index( if kind is not None: raise NotImplementedError("kind is not yet supported") - if not sort_remaining: - raise NotImplementedError( - "sort_remaining == False is not yet supported" - ) - if key is not None: raise NotImplementedError("key is not yet supported.") @@ -1609,16 +1606,22 @@ def sort_index( if level is not None: # Pandas doesn't handle na_position in case of MultiIndex. na_position = "first" if ascending is True else "last" - labels = [ - idx._get_level_label(lvl) - for lvl in (level if is_list_like(level) else (level,)) - ] - # Explicitly construct a Frame rather than using type(self) - # to avoid constructing a SingleColumnFrame (e.g. Series). - idx = Frame._from_data(idx._data.select_by_label(labels)) + if not is_list_like(level): + level = [level] + by = list(map(idx._get_level_label, level)) + if sort_remaining: + handled = set(by) + by.extend( + filter( + lambda n: n not in handled, + self.index._data.names, + ) + ) + else: + by = list(idx._data.names) inds = idx._get_sorted_inds( - ascending=ascending, na_position=na_position + by=by, ascending=ascending, na_position=na_position ) out = self._gather( GatherMap.from_column_unchecked( diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index 2fef741ac09..78be3085754 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -797,17 +797,17 @@ def deserialize(cls, header, frames): return obj - def _get_columns_by_label(self, labels, downcast=False): + def _get_columns_by_label(self, labels, *, downcast=False) -> Self: """Return the column specified by `labels` For cudf.Series, either the column, or an empty series is returned. Parameter `downcast` does not have effects. 
""" - new_data = super()._get_columns_by_label(labels, downcast) + ca = self._data.select_by_label(labels) return ( - self.__class__._from_data(data=new_data, index=self.index) - if len(new_data) > 0 + self.__class__._from_data(data=ca, index=self.index) + if len(ca) > 0 else self.__class__(dtype=self.dtype, name=self.name) ) diff --git a/python/cudf/cudf/tests/test_multiindex.py b/python/cudf/cudf/tests/test_multiindex.py index eedc9b0c174..56bd7d709b7 100644 --- a/python/cudf/cudf/tests/test_multiindex.py +++ b/python/cudf/cudf/tests/test_multiindex.py @@ -1897,3 +1897,26 @@ def test_multiindex_empty_slice_pandas_compatibility(): with cudf.option_context("mode.pandas_compatible", True): actual = cudf.from_pandas(expected) assert_eq(expected, actual, exact=False) + + +@pytest.mark.parametrize( + "levels", + itertools.chain.from_iterable( + itertools.permutations(range(3), n) for n in range(1, 4) + ), + ids=str, +) +def test_multiindex_sort_index_partial(levels): + df = pd.DataFrame( + { + "a": [3, 3, 3, 1, 1, 1, 2, 2], + "b": [4, 2, 7, -1, 11, -2, 7, 7], + "c": [4, 4, 2, 3, 3, 3, 1, 1], + "val": [1, 2, 3, 4, 5, 6, 7, 8], + } + ).set_index(["a", "b", "c"]) + cdf = cudf.from_pandas(df) + + expect = df.sort_index(level=levels, sort_remaining=True) + got = cdf.sort_index(level=levels, sort_remaining=True) + assert_eq(expect, got) From c82a70807849188274d21b595d5ded818aad4464 Mon Sep 17 00:00:00 2001 From: Chong Gao Date: Wed, 6 Sep 2023 10:57:10 +0800 Subject: [PATCH 145/230] Fix map column can not be non-nullable for java (#14003) Make map column non-nullable for java. Changes: - Add a new method to pass nullable; Deprecate the old one. - Update the tests. Authors: - Chong Gao (https://github.com/res-life) - Nghia Truong (https://github.com/ttnghia) Approvers: - Robert (Bobby) Evans (https://github.com/revans2) URL: https://github.com/rapidsai/cudf/pull/14003 --- .../ai/rapids/cudf/ColumnWriterOptions.java | 30 +++++++++++++++++++ .../test/java/ai/rapids/cudf/TableTest.java | 6 ++-- 2 files changed, 34 insertions(+), 2 deletions(-) diff --git a/java/src/main/java/ai/rapids/cudf/ColumnWriterOptions.java b/java/src/main/java/ai/rapids/cudf/ColumnWriterOptions.java index 2177f58c9de..a95c5f58f09 100644 --- a/java/src/main/java/ai/rapids/cudf/ColumnWriterOptions.java +++ b/java/src/main/java/ai/rapids/cudf/ColumnWriterOptions.java @@ -522,7 +522,11 @@ protected String[] getFlatColumnNames(String[] ret) { * Maps are List columns with a Struct named 'key_value' with a child named 'key' and a child * named 'value'. The caller of this method doesn't need to worry about this as this method will * take care of this without the knowledge of the caller. + * + * Note: This method always returns a nullabe column, cannot return non-nullable column. + * Do not use this, use the next function with the parameter `isNullable`. */ + @Deprecated public static ColumnWriterOptions mapColumn(String name, ColumnWriterOptions key, ColumnWriterOptions value) { StructColumnWriterOptions struct = structBuilder("key_value").build(); @@ -537,6 +541,32 @@ public static ColumnWriterOptions mapColumn(String name, ColumnWriterOptions key return opt; } + /** + * Add a Map Column to the schema. + *

+ * Maps are List columns with a Struct named 'key_value' with a child named 'key' and a child + * named 'value'. The caller of this method doesn't need to worry about this as this method will + * take care of this without the knowledge of the caller. + * + * Note: If this map column is a key of another map, should pass isNullable = false. + * e.g.: map1(map2(int, int), int) the map2 should be non-nullable. + * + * @param isNullable is the returned map nullable. + */ + public static ColumnWriterOptions mapColumn(String name, ColumnWriterOptions key, + ColumnWriterOptions value, Boolean isNullable) { + if (key.isNullable) { + throw new IllegalArgumentException("key column can not be nullable"); + } + StructColumnWriterOptions struct = structBuilder("key_value").build(); + struct.childColumnOptions = new ColumnWriterOptions[]{key, value}; + ColumnWriterOptions opt = listBuilder(name, isNullable) + .withStructColumn(struct) + .build(); + opt.isMap = true; + return opt; + } + /** * Creates a ListBuilder for column called 'name' */ diff --git a/java/src/test/java/ai/rapids/cudf/TableTest.java b/java/src/test/java/ai/rapids/cudf/TableTest.java index 5c0c738a20f..3740328615a 100644 --- a/java/src/test/java/ai/rapids/cudf/TableTest.java +++ b/java/src/test/java/ai/rapids/cudf/TableTest.java @@ -8064,7 +8064,8 @@ void testParquetWriteMap() throws IOException { ParquetWriterOptions options = ParquetWriterOptions.builder() .withMapColumn(mapColumn("my_map", new ColumnWriterOptions("key0", false), - new ColumnWriterOptions("value0"))).build(); + new ColumnWriterOptions("value0"), + true)).build(); File f = File.createTempFile("test-map", ".parquet"); List list1 = Arrays.asList(new HostColumnVector.StructData(Arrays.asList("a", "b"))); @@ -8562,7 +8563,8 @@ void testORCWriteMapChunked() throws IOException { ORCWriterOptions options = ORCWriterOptions.builder() .withMapColumn(mapColumn("my_map", new ColumnWriterOptions("key0", false), - new ColumnWriterOptions("value0"))).build(); + new ColumnWriterOptions("value0"), + true)).build(); File f = File.createTempFile("test-map", ".parquet"); List list1 = Arrays.asList(new HostColumnVector.StructData(Arrays.asList("a", "b"))); From 1d7a77be153c09b007410d6dc8538705fbfd73ab Mon Sep 17 00:00:00 2001 From: Divye Gala Date: Wed, 6 Sep 2023 12:13:27 -0400 Subject: [PATCH 146/230] Use `cudf::thread_index_type` in `merge.cu` (#13972) This PR uses `cudf::thread_index_type` to avoid overflows. 
Authors: - Divye Gala (https://github.com/divyegala) Approvers: - Bradley Dice (https://github.com/bdice) - Vyas Ramasubramani (https://github.com/vyasr) URL: https://github.com/rapidsai/cudf/pull/13972 --- cpp/src/merge/merge.cu | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/cpp/src/merge/merge.cu b/cpp/src/merge/merge.cu index 5c54bb5661c..c0765b48205 100644 --- a/cpp/src/merge/merge.cu +++ b/cpp/src/merge/merge.cu @@ -78,11 +78,14 @@ __global__ void materialize_merged_bitmask_kernel( size_type const num_destination_rows, index_type const* const __restrict__ merged_indices) { - size_type destination_row = threadIdx.x + blockIdx.x * blockDim.x; + auto const stride = detail::grid_1d::grid_stride(); - auto active_threads = __ballot_sync(0xffff'ffffu, destination_row < num_destination_rows); + auto tid = detail::grid_1d::global_thread_id(); - while (destination_row < num_destination_rows) { + auto active_threads = __ballot_sync(0xffff'ffffu, tid < num_destination_rows); + + while (tid < num_destination_rows) { + auto const destination_row = static_cast(tid); auto const [src_side, src_row] = merged_indices[destination_row]; bool const from_left{src_side == side::LEFT}; bool source_bit_is_valid{true}; @@ -99,8 +102,8 @@ __global__ void materialize_merged_bitmask_kernel( // Only one thread writes output if (0 == threadIdx.x % warpSize) { out_validity[word_index(destination_row)] = result_mask; } - destination_row += blockDim.x * gridDim.x; - active_threads = __ballot_sync(active_threads, destination_row < num_destination_rows); + tid += stride; + active_threads = __ballot_sync(active_threads, tid < num_destination_rows); } } From 609f894fcd53b99acf0889562e78e706cb7812d8 Mon Sep 17 00:00:00 2001 From: Nghia Truong <7416935+ttnghia@users.noreply.github.com> Date: Wed, 6 Sep 2023 11:17:17 -0700 Subject: [PATCH 147/230] Temporary fix Parquet metadata with empty value string being ignored from writing (#14026) When writing to Parquet files, Spark needs to write pairs of key-value strings into files' metadata. Sometimes the value strings are just an empty string. Such empty string is ignored from writing into the file, causing other applications (such as Spark) to read the value and interpret it as a `null` instead of an empty string as in the original input, as described in https://github.com/rapidsai/cudf/issues/14024. This is wrong and led to data corruption as I tested. This PR intentionally modifies the empty value string into a space character to workaround the bug. This is a temporary fix while waiting for a better fix to be worked on. Authors: - Nghia Truong (https://github.com/ttnghia) Approvers: - Robert (Bobby) Evans (https://github.com/revans2) URL: https://github.com/rapidsai/cudf/pull/14026 --- java/src/main/native/src/TableJni.cpp | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/java/src/main/native/src/TableJni.cpp b/java/src/main/native/src/TableJni.cpp index f7ada4305db..b05fc9b7bc4 100644 --- a/java/src/main/native/src/TableJni.cpp +++ b/java/src/main/native/src/TableJni.cpp @@ -1592,7 +1592,11 @@ JNIEXPORT long JNICALL Java_ai_rapids_cudf_Table_writeParquetBufferBegin( std::map kv_metadata; std::transform(meta_keys.begin(), meta_keys.end(), meta_values.begin(), std::inserter(kv_metadata, kv_metadata.end()), - [](auto const &key, auto const &value) { return std::make_pair(key, value); }); + [](auto const &key, auto const &value) { + // The metadata value will be ignored if it is empty. 
+ // We modify it into a space character to workaround such issue. + return std::make_pair(key, value.empty() ? std::string(" ") : value); + }); auto stats = std::make_shared(); chunked_parquet_writer_options opts = @@ -1638,7 +1642,11 @@ JNIEXPORT long JNICALL Java_ai_rapids_cudf_Table_writeParquetFileBegin( std::map kv_metadata; std::transform(meta_keys.begin(), meta_keys.end(), meta_values.begin(), std::inserter(kv_metadata, kv_metadata.end()), - [](auto const &key, auto const &value) { return std::make_pair(key, value); }); + [](auto const &key, auto const &value) { + // The metadata value will be ignored if it is empty. + // We modify it into a space character to workaround such issue. + return std::make_pair(key, value.empty() ? std::string(" ") : value); + }); sink_info sink{output_path.get()}; auto stats = std::make_shared(); From ea59dbf74e4d962ac20ebb0d6d3b71eaaeaad494 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Wed, 6 Sep 2023 09:31:30 -1000 Subject: [PATCH 148/230] Raise NotImplementedError for to_datetime with z format (#14037) Avoids timezone information from being dropped in `to_datetime` when the z directive is provided ```python In [1]: import cudf In [2]: fmt = '%Y-%m-%d %H:%M:%S %Z' ...: dates = ['2010-01-01 12:00:00 UTC', '2010-01-01 12:00:00 UTC'] In [3]: cudf.to_datetime(dates, format=fmt) Out[3]: DatetimeIndex(['2010-01-01 12:00:00', '2010-01-01 12:00:00'], dtype='datetime64[ns]') ``` Authors: - Matthew Roeschke (https://github.com/mroeschke) Approvers: - Lawrence Mitchell (https://github.com/wence-) URL: https://github.com/rapidsai/cudf/pull/14037 --- python/cudf/cudf/core/tools/datetimes.py | 9 +++++++-- python/cudf/cudf/tests/test_datetime.py | 8 ++++++++ 2 files changed, 15 insertions(+), 2 deletions(-) diff --git a/python/cudf/cudf/core/tools/datetimes.py b/python/cudf/cudf/core/tools/datetimes.py index 7c4b9810df2..a759f9dc3e1 100644 --- a/python/cudf/cudf/core/tools/datetimes.py +++ b/python/cudf/cudf/core/tools/datetimes.py @@ -147,8 +147,13 @@ def to_datetime( if utc: raise NotImplementedError("utc is not yet implemented") - if format is not None and "%f" in format: - format = format.replace("%f", "%9f") + if format is not None: + if "%Z" in format or "%z" in format: + raise NotImplementedError( + "cuDF does not yet support timezone-aware datetimes" + ) + elif "%f" in format: + format = format.replace("%f", "%9f") try: if isinstance(arg, cudf.DataFrame): diff --git a/python/cudf/cudf/tests/test_datetime.py b/python/cudf/cudf/tests/test_datetime.py index b1685950241..4a4e9b67c2e 100644 --- a/python/cudf/cudf/tests/test_datetime.py +++ b/python/cudf/cudf/tests/test_datetime.py @@ -2148,3 +2148,11 @@ def test_daterange_pandas_compatibility(): "2010-01-01", "2010-02-01", periods=10, name="times" ) assert_eq(expected, actual) + + +@pytest.mark.parametrize("code", ["z", "Z"]) +def test_format_timezone_not_implemented(code): + with pytest.raises(NotImplementedError): + cudf.to_datetime( + ["2020-01-01 00:00:00 UTC"], format=f"%Y-%m-%d %H:%M:%S %{code}" + ) From e81d79e94f268499c8656eba1fe8de8122589780 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Wed, 6 Sep 2023 14:48:50 -0700 Subject: [PATCH 149/230] Expose streams in public search APIs (#14034) Contributes to #925 Authors: - Vyas Ramasubramani (https://github.com/vyasr) Approvers: - David Wendt (https://github.com/davidwendt) - Nghia Truong (https://github.com/ttnghia) URL: https://github.com/rapidsai/cudf/pull/14034 --- cpp/include/cudf/search.hpp 
| 11 ++++- cpp/src/search/contains_column.cu | 5 ++- cpp/src/search/contains_scalar.cu | 4 +- cpp/src/search/search_ordered.cu | 10 ++--- cpp/tests/CMakeLists.txt | 1 + cpp/tests/streams/search_test.cpp | 69 +++++++++++++++++++++++++++++++ 6 files changed, 90 insertions(+), 10 deletions(-) create mode 100644 cpp/tests/streams/search_test.cpp diff --git a/cpp/include/cudf/search.hpp b/cpp/include/cudf/search.hpp index fee22786d7a..49acce6a63b 100644 --- a/cpp/include/cudf/search.hpp +++ b/cpp/include/cudf/search.hpp @@ -63,6 +63,7 @@ namespace cudf { * @param needles Values for which to find the insert locations in the search space * @param column_order Vector of column sort order * @param null_precedence Vector of null_precedence enums needles + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned column's device memory * @return A non-nullable column of elements containing the insertion points */ @@ -71,6 +72,7 @@ std::unique_ptr lower_bound( table_view const& needles, std::vector const& column_order, std::vector const& null_precedence, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -103,6 +105,7 @@ std::unique_ptr lower_bound( * @param needles Values for which to find the insert locations in the search space * @param column_order Vector of column sort order * @param null_precedence Vector of null_precedence enums needles + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned column's device memory * @return A non-nullable column of elements containing the insertion points */ @@ -111,6 +114,7 @@ std::unique_ptr upper_bound( table_view const& needles, std::vector const& column_order, std::vector const& null_precedence, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -128,9 +132,12 @@ std::unique_ptr upper_bound( * * @param haystack The column containing search space * @param needle A scalar value to check for existence in the search space + * @param stream CUDA stream used for device memory operations and kernel launches * @return true if the given `needle` value exists in the `haystack` column */ -bool contains(column_view const& haystack, scalar const& needle); +bool contains(column_view const& haystack, + scalar const& needle, + rmm::cuda_stream_view stream = cudf::get_default_stream()); /** * @brief Check if the given `needles` values exists in the `haystack` column. 
@@ -149,12 +156,14 @@ bool contains(column_view const& haystack, scalar const& needle); * * @param haystack The column containing search space * @param needles A column of values to check for existence in the search space + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned column's device memory * @return A BOOL column indicating if each element in `needles` exists in the search space */ std::unique_ptr contains( column_view const& haystack, column_view const& needles, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @} */ // end of group diff --git a/cpp/src/search/contains_column.cu b/cpp/src/search/contains_column.cu index 08bcf8d48d8..4363bd212fe 100644 --- a/cpp/src/search/contains_column.cu +++ b/cpp/src/search/contains_column.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2022, NVIDIA CORPORATION. + * Copyright (c) 2019-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -154,10 +154,11 @@ std::unique_ptr contains(column_view const& haystack, std::unique_ptr contains(column_view const& haystack, column_view const& needles, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::contains(haystack, needles, cudf::get_default_stream(), mr); + return detail::contains(haystack, needles, stream, mr); } } // namespace cudf diff --git a/cpp/src/search/contains_scalar.cu b/cpp/src/search/contains_scalar.cu index 7c16a1b12ef..0b344ec347b 100644 --- a/cpp/src/search/contains_scalar.cu +++ b/cpp/src/search/contains_scalar.cu @@ -160,10 +160,10 @@ bool contains(column_view const& haystack, scalar const& needle, rmm::cuda_strea } // namespace detail -bool contains(column_view const& haystack, scalar const& needle) +bool contains(column_view const& haystack, scalar const& needle, rmm::cuda_stream_view stream) { CUDF_FUNC_RANGE(); - return detail::contains(haystack, needle, cudf::get_default_stream()); + return detail::contains(haystack, needle, stream); } } // namespace cudf diff --git a/cpp/src/search/search_ordered.cu b/cpp/src/search/search_ordered.cu index bf0eb8d46f8..3b5dbef0401 100644 --- a/cpp/src/search/search_ordered.cu +++ b/cpp/src/search/search_ordered.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022, NVIDIA CORPORATION. + * Copyright (c) 2022-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -144,22 +144,22 @@ std::unique_ptr lower_bound(table_view const& haystack, table_view const& needles, std::vector const& column_order, std::vector const& null_precedence, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::lower_bound( - haystack, needles, column_order, null_precedence, cudf::get_default_stream(), mr); + return detail::lower_bound(haystack, needles, column_order, null_precedence, stream, mr); } std::unique_ptr upper_bound(table_view const& haystack, table_view const& needles, std::vector const& column_order, std::vector const& null_precedence, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::upper_bound( - haystack, needles, column_order, null_precedence, cudf::get_default_stream(), mr); + return detail::upper_bound(haystack, needles, column_order, null_precedence, stream, mr); } } // namespace cudf diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt index 1bb1987198d..a69dc9bf2f8 100644 --- a/cpp/tests/CMakeLists.txt +++ b/cpp/tests/CMakeLists.txt @@ -626,6 +626,7 @@ ConfigureTest(STREAM_GROUPBY_TEST streams/groupby_test.cpp STREAM_MODE testing) ConfigureTest(STREAM_CONCATENATE_TEST streams/concatenate_test.cpp STREAM_MODE testing) ConfigureTest(STREAM_FILLING_TEST streams/filling_test.cpp STREAM_MODE testing) ConfigureTest(STREAM_REPLACE_TEST streams/replace_test.cpp STREAM_MODE testing) +ConfigureTest(STREAM_SEARCH_TEST streams/search_test.cpp STREAM_MODE testing) # ################################################################################################## # Install tests #################################################################################### diff --git a/cpp/tests/streams/search_test.cpp b/cpp/tests/streams/search_test.cpp new file mode 100644 index 00000000000..fbe17fb0cc4 --- /dev/null +++ b/cpp/tests/streams/search_test.cpp @@ -0,0 +1,69 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include +#include + +#include +#include +#include + +class SearchTest : public cudf::test::BaseFixture {}; + +TEST_F(SearchTest, LowerBound) +{ + cudf::test::fixed_width_column_wrapper column{10, 20, 30, 40, 50}; + cudf::test::fixed_width_column_wrapper values{0, 7, 10, 11, 30, 32, 40, 47, 50, 90}; + cudf::test::fixed_width_column_wrapper expect{0, 0, 0, 1, 2, 3, 3, 4, 4, 5}; + + cudf::lower_bound({cudf::table_view{{column}}}, + {cudf::table_view{{values}}}, + {cudf::order::ASCENDING}, + {cudf::null_order::BEFORE}, + cudf::test::get_default_stream()); +} + +TEST_F(SearchTest, UpperBound) +{ + cudf::test::fixed_width_column_wrapper column{10, 20, 30, 40, 50}; + cudf::test::fixed_width_column_wrapper values{0, 7, 10, 11, 30, 32, 40, 47, 50, 90}; + cudf::test::fixed_width_column_wrapper expect{0, 0, 0, 1, 2, 3, 3, 4, 4, 5}; + + cudf::upper_bound({cudf::table_view{{column}}}, + {cudf::table_view{{values}}}, + {cudf::order::ASCENDING}, + {cudf::null_order::BEFORE}, + cudf::test::get_default_stream()); +} + +TEST_F(SearchTest, ContainsScalar) +{ + cudf::test::fixed_width_column_wrapper column{0, 1, 17, 19, 23, 29, 71}; + cudf::numeric_scalar scalar{23, true, cudf::test::get_default_stream()}; + + cudf::contains(column, scalar, cudf::test::get_default_stream()); +} + +TEST_F(SearchTest, ContainsColumn) +{ + cudf::test::fixed_width_column_wrapper haystack{0, 1, 17, 19, 23, 29, 71}; + cudf::test::fixed_width_column_wrapper needles{17, 19, 45, 72}; + + cudf::test::fixed_width_column_wrapper expect{1, 1, 0, 0}; + + cudf::contains(haystack, needles, cudf::test::get_default_stream()); +} From 0190c2921d0278f80328240b76a22e6628cb24f7 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Wed, 6 Sep 2023 14:41:46 -1000 Subject: [PATCH 150/230] Raise TypeError for any non-parseable argument in to_datetime (#14044) Avoids the following incorrect behavior ```python In [7]: cudf.to_datetime([True]) Out[7]: GenericIndex([True], dtype='bool') In [1]: import pandas In [2]: pandas.to_datetime([True]) TypeError: is not convertible to datetime ``` Authors: - Matthew Roeschke (https://github.com/mroeschke) Approvers: - Bradley Dice (https://github.com/bdice) URL: https://github.com/rapidsai/cudf/pull/14044 --- python/cudf/cudf/core/tools/datetimes.py | 16 ++++++++-------- python/cudf/cudf/tests/test_datetime.py | 6 ++++++ 2 files changed, 14 insertions(+), 8 deletions(-) diff --git a/python/cudf/cudf/core/tools/datetimes.py b/python/cudf/cudf/core/tools/datetimes.py index a759f9dc3e1..f736e055163 100644 --- a/python/cudf/cudf/core/tools/datetimes.py +++ b/python/cudf/cudf/core/tools/datetimes.py @@ -294,12 +294,8 @@ def to_datetime( def _process_col(col, unit, dayfirst, infer_datetime_format, format): if col.dtype.kind == "M": return col - elif col.dtype.kind == "m": - raise TypeError( - f"dtype {col.dtype} cannot be converted to {_unit_dtype_map[unit]}" - ) - if col.dtype.kind in ("f"): + elif col.dtype.kind in ("f"): if unit not in (None, "ns"): factor = cudf.Scalar( column.datetime._unit_to_nanoseconds_conversion[unit] @@ -325,8 +321,9 @@ def _process_col(col, unit, dayfirst, infer_datetime_format, format): ) else: col = col.as_datetime_column(dtype="datetime64[ns]") + return col - if col.dtype.kind in ("i"): + elif col.dtype.kind in ("i"): if unit in ("D", "h", "m"): factor = cudf.Scalar( column.datetime._unit_to_nanoseconds_conversion[unit] @@ -340,6 +337,7 @@ def _process_col(col, unit, dayfirst, infer_datetime_format, format): ) else: col = 
col.as_datetime_column(dtype=_unit_dtype_map[unit]) + return col elif col.dtype.kind in ("O"): if unit not in (None, "ns") or col.null_count == len(col): @@ -364,11 +362,13 @@ def _process_col(col, unit, dayfirst, infer_datetime_format, format): format = column.datetime.infer_format( element=col.element_indexing(0) ) - col = col.as_datetime_column( + return col.as_datetime_column( dtype=_unit_dtype_map[unit], format=format, ) - return col + raise TypeError( + f"dtype {col.dtype} cannot be converted to {_unit_dtype_map[unit]}" + ) def get_units(value): diff --git a/python/cudf/cudf/tests/test_datetime.py b/python/cudf/cudf/tests/test_datetime.py index 4a4e9b67c2e..4c20258ae67 100644 --- a/python/cudf/cudf/tests/test_datetime.py +++ b/python/cudf/cudf/tests/test_datetime.py @@ -2156,3 +2156,9 @@ def test_format_timezone_not_implemented(code): cudf.to_datetime( ["2020-01-01 00:00:00 UTC"], format=f"%Y-%m-%d %H:%M:%S %{code}" ) + + +@pytest.mark.parametrize("arg", [True, False]) +def test_args_not_datetime_typerror(arg): + with pytest.raises(TypeError): + cudf.to_datetime([arg]) From dd6553a22d6cfcc2f017775a57d7b49783d62a9c Mon Sep 17 00:00:00 2001 From: Mark Harris <783069+harrism@users.noreply.github.com> Date: Thu, 7 Sep 2023 16:07:23 +1000 Subject: [PATCH 151/230] Ignore compile_commands.json (#14048) Fixes #14047 Adds compile_commands.json to .gitignore. Authors: - Mark Harris (https://github.com/harrism) Approvers: - Bradley Dice (https://github.com/bdice) URL: https://github.com/rapidsai/cudf/pull/14048 --- .gitignore | 1 + 1 file changed, 1 insertion(+) diff --git a/.gitignore b/.gitignore index fb5c301fe3f..a9bf0854d65 100644 --- a/.gitignore +++ b/.gitignore @@ -16,6 +16,7 @@ DartConfiguration.tcl *.spec .nfs* .clangd +compile_commands.json ## Python build directories & artifacts dask-worker-space/ From 7331922486c0e5f1e6a765efa8063aa7603c7add Mon Sep 17 00:00:00 2001 From: GALI PREM SAGAR Date: Thu, 7 Sep 2023 09:59:52 -0500 Subject: [PATCH 152/230] Raise `NotImplementedError` for `MultiIndex.to_series` (#14049) Fixes #14035 This PR raises an error for `MultiIndex.to_series` because we cannot store `tuple` type columns in `cudf`. Authors: - GALI PREM SAGAR (https://github.com/galipremsagar) Approvers: - Lawrence Mitchell (https://github.com/wence-) URL: https://github.com/rapidsai/cudf/pull/14049 --- python/cudf/cudf/core/multiindex.py | 6 ++++++ python/cudf/cudf/tests/test_multiindex.py | 6 ++++++ 2 files changed, 12 insertions(+) diff --git a/python/cudf/cudf/core/multiindex.py b/python/cudf/cudf/core/multiindex.py index 12da69740d8..bc6726879c1 100644 --- a/python/cudf/cudf/core/multiindex.py +++ b/python/cudf/cudf/core/multiindex.py @@ -219,6 +219,12 @@ def names(self, value): ) self._names = pd.core.indexes.frozen.FrozenList(value) + @_cudf_nvtx_annotate + def to_series(self, index=None, name=None): + raise NotImplementedError( + "MultiIndex.to_series isn't implemented yet." 
+ ) + @_cudf_nvtx_annotate def astype(self, dtype, copy: bool = True): if not is_object_dtype(dtype): diff --git a/python/cudf/cudf/tests/test_multiindex.py b/python/cudf/cudf/tests/test_multiindex.py index 56bd7d709b7..3c843ace0a8 100644 --- a/python/cudf/cudf/tests/test_multiindex.py +++ b/python/cudf/cudf/tests/test_multiindex.py @@ -1920,3 +1920,9 @@ def test_multiindex_sort_index_partial(levels): expect = df.sort_index(level=levels, sort_remaining=True) got = cdf.sort_index(level=levels, sort_remaining=True) assert_eq(expect, got) + + +def test_multiindex_to_series_error(): + midx = cudf.MultiIndex.from_tuples([("a", "b")]) + with pytest.raises(NotImplementedError): + midx.to_series() From dc5f5006b1e7c9d5ca3649188833e0a3b44cc841 Mon Sep 17 00:00:00 2001 From: GALI PREM SAGAR Date: Thu, 7 Sep 2023 10:01:37 -0500 Subject: [PATCH 153/230] Fix `IntervalIndex.union` to preserve type-metadata (#14051) Fixes: #14041 This PR fixes `fillna` that will preserve the type-metadata for `IntervalColumn`. Authors: - GALI PREM SAGAR (https://github.com/galipremsagar) Approvers: - Bradley Dice (https://github.com/bdice) URL: https://github.com/rapidsai/cudf/pull/14051 --- python/cudf/cudf/core/column/categorical.py | 13 +------------ python/cudf/cudf/core/column/column.py | 2 +- python/cudf/cudf/core/column/decimal.py | 5 +---- python/cudf/cudf/tests/test_index.py | 4 ++++ 4 files changed, 7 insertions(+), 17 deletions(-) diff --git a/python/cudf/cudf/core/column/categorical.py b/python/cudf/cudf/core/column/categorical.py index eaffc18db70..5be609c81bc 100644 --- a/python/cudf/cudf/core/column/categorical.py +++ b/python/cudf/cudf/core/column/categorical.py @@ -1272,18 +1272,7 @@ def fillna( self.codes.dtype ) - result = super().fillna(value=fill_value, method=method) - - result = column.build_categorical_column( - categories=self.dtype.categories._values, - codes=column.build_column(result.base_data, dtype=result.dtype), - offset=result.offset, - size=result.size, - mask=result.base_mask, - ordered=self.dtype.ordered, - ) - - return result + return super().fillna(value=fill_value, method=method) def indices_of( self, value: ScalarLike diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index 9dde17a1045..a8735a1dd8d 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -715,7 +715,7 @@ def fillna( """ return libcudf.replace.replace_nulls( input_col=self, replacement=value, method=method, dtype=dtype - ) + )._with_type_metadata(self.dtype) def isnull(self) -> ColumnBase: """Identify missing values in a Column.""" diff --git a/python/cudf/cudf/core/column/decimal.py b/python/cudf/cudf/core/column/decimal.py index a8a707ec805..5a823c5f7c3 100644 --- a/python/cudf/cudf/core/column/decimal.py +++ b/python/cudf/cudf/core/column/decimal.py @@ -147,10 +147,7 @@ def fillna( "integer values" ) - result = libcudf.replace.replace_nulls( - input_col=self, replacement=value, method=method, dtype=dtype - ) - return result._with_type_metadata(self.dtype) + return super().fillna(value=value, method=method) def normalize_binop_value(self, other): if isinstance(other, ColumnBase): diff --git a/python/cudf/cudf/tests/test_index.py b/python/cudf/cudf/tests/test_index.py index 359b3c519de..5730ecc4ae7 100644 --- a/python/cudf/cudf/tests/test_index.py +++ b/python/cudf/cudf/tests/test_index.py @@ -2050,6 +2050,10 @@ def test_range_index_concat(objs): (pd.Index([0, 1, 2, 30], name="a"), [90, 100]), (pd.Index([0, 1, 2, 30]), pd.Index([0, 
10, 1.0, 11])), (pd.Index(["a", "b", "c", "d", "c"]), pd.Index(["a", "c", "z"])), + ( + pd.IntervalIndex.from_tuples([(0, 2), (0, 2), (2, 4)]), + pd.IntervalIndex.from_tuples([(0, 2), (2, 4)]), + ), ], ) @pytest.mark.parametrize("sort", [None, False]) From 6945c4f8b9a0f8497b1f9f662a2015bdc4992048 Mon Sep 17 00:00:00 2001 From: GALI PREM SAGAR Date: Thu, 7 Sep 2023 12:04:23 -0500 Subject: [PATCH 154/230] Raise `MixedTypeError` when a column of mixed-dtype is being constructed (#14050) Fixes #14038 This PR introduces changes that raise an error when a column of `object` dtype is being constructed when the data is not string or bools. Authors: - GALI PREM SAGAR (https://github.com/galipremsagar) Approvers: - Matthew Roeschke (https://github.com/mroeschke) URL: https://github.com/rapidsai/cudf/pull/14050 --- python/cudf/cudf/core/column/column.py | 19 ++++++++++++++----- python/cudf/cudf/tests/test_index.py | 3 ++- python/cudf/cudf/tests/test_parquet.py | 4 ++-- python/cudf/cudf/tests/test_series.py | 6 +++++- 4 files changed, 23 insertions(+), 9 deletions(-) diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index a8735a1dd8d..b4ad6765207 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -2062,10 +2062,15 @@ def as_column( ) else: pyarrow_array = pa.array(arbitrary, from_pandas=nan_as_null) - if arbitrary.dtype == cudf.dtype("object") and isinstance( - pyarrow_array, (pa.DurationArray, pa.TimestampArray) + if ( + arbitrary.dtype == cudf.dtype("object") + and cudf.dtype(pyarrow_array.type.to_pandas_dtype()) + != cudf.dtype(arbitrary.dtype) + and not is_bool_dtype( + cudf.dtype(pyarrow_array.type.to_pandas_dtype()) + ) ): - raise TypeError("Cannot create column with mixed types") + raise MixedTypeError("Cannot create column with mixed types") if isinstance(pyarrow_array.type, pa.Decimal128Type): pyarrow_type = cudf.Decimal128Dtype.from_arrow( pyarrow_array.type @@ -2436,8 +2441,12 @@ def as_column( if ( isinstance(arbitrary, pd.Index) and arbitrary.dtype == cudf.dtype("object") - and isinstance( - pyarrow_array, (pa.DurationArray, pa.TimestampArray) + and ( + cudf.dtype(pyarrow_array.type.to_pandas_dtype()) + != cudf.dtype(arbitrary.dtype) + and not is_bool_dtype( + cudf.dtype(pyarrow_array.type.to_pandas_dtype()) + ) ) ): raise MixedTypeError( diff --git a/python/cudf/cudf/tests/test_index.py b/python/cudf/cudf/tests/test_index.py index 5730ecc4ae7..819527ac312 100644 --- a/python/cudf/cudf/tests/test_index.py +++ b/python/cudf/cudf/tests/test_index.py @@ -2676,10 +2676,11 @@ def test_scalar_getitem(self, index_values, i): 12, 20, ], + [1, 2, 3, 4], ], ) def test_index_mixed_dtype_error(data): - pi = pd.Index(data) + pi = pd.Index(data, dtype="object") with pytest.raises(TypeError): cudf.Index(pi) diff --git a/python/cudf/cudf/tests/test_parquet.py b/python/cudf/cudf/tests/test_parquet.py index 66c4a253423..b892cc62ac4 100644 --- a/python/cudf/cudf/tests/test_parquet.py +++ b/python/cudf/cudf/tests/test_parquet.py @@ -2374,11 +2374,11 @@ def test_parquet_writer_list_statistics(tmpdir): for i, col in enumerate(pd_slice): stats = pq_file.metadata.row_group(rg).column(i).statistics - actual_min = cudf.Series(pd_slice[col].explode().explode()).min() + actual_min = pd_slice[col].explode().explode().dropna().min() stats_min = stats.min assert normalized_equals(actual_min, stats_min) - actual_max = cudf.Series(pd_slice[col].explode().explode()).max() + actual_max = 
pd_slice[col].explode().explode().dropna().max() stats_max = stats.max assert normalized_equals(actual_max, stats_max) diff --git a/python/cudf/cudf/tests/test_series.py b/python/cudf/cudf/tests/test_series.py index 51c6bb1634d..783d7d31d7f 100644 --- a/python/cudf/cudf/tests/test_series.py +++ b/python/cudf/cudf/tests/test_series.py @@ -2187,11 +2187,15 @@ def test_series_init_error(): ) -@pytest.mark.parametrize("dtype", ["datetime64[ns]", "timedelta64[ns]"]) +@pytest.mark.parametrize( + "dtype", ["datetime64[ns]", "timedelta64[ns]", "object", "str"] +) def test_series_mixed_dtype_error(dtype): ps = pd.concat([pd.Series([1, 2, 3], dtype=dtype), pd.Series([10, 11])]) with pytest.raises(TypeError): cudf.Series(ps) + with pytest.raises(TypeError): + cudf.Series(ps.array) @pytest.mark.parametrize("data", [[True, False, None], [10, 200, 300]]) From c9d88219ce6e920b8fad977ade437bf87d1d5099 Mon Sep 17 00:00:00 2001 From: GALI PREM SAGAR Date: Thu, 7 Sep 2023 12:34:51 -0500 Subject: [PATCH 155/230] Fix empty string column construction (#14052) Fixes #14046 This PR fixes empty string column construction that arises due to a corner-case in the way pyarrow constructs arrays. Authors: - GALI PREM SAGAR (https://github.com/galipremsagar) Approvers: - Bradley Dice (https://github.com/bdice) URL: https://github.com/rapidsai/cudf/pull/14052 --- python/cudf/cudf/core/column/column.py | 15 +++++++++++++++ python/cudf/cudf/tests/test_dataframe.py | 5 +---- python/cudf/cudf/tests/test_index.py | 24 ++++++++++++++++++++++++ 3 files changed, 40 insertions(+), 4 deletions(-) diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index b4ad6765207..59ab3569814 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -2438,6 +2438,21 @@ def as_column( from_pandas=True if nan_as_null is None else nan_as_null, ) + if ( + isinstance(pyarrow_array, pa.NullArray) + and pa_type is None + and dtype is None + and getattr(arbitrary, "dtype", None) + == cudf.dtype("object") + ): + # pa.array constructor returns a NullArray + # for empty arrays, instead of a StringArray. + # This issue is only specific to this dtype, + # all other dtypes, result in their corresponding + # arrow array creation. 
+ dtype = cudf.dtype("str") + pyarrow_array = pyarrow_array.cast(np_to_pa_dtype(dtype)) + if ( isinstance(arbitrary, pd.Index) and arbitrary.dtype == cudf.dtype("object") diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py index 3c84cfe48c4..44d0b9249d0 100644 --- a/python/cudf/cudf/tests/test_dataframe.py +++ b/python/cudf/cudf/tests/test_dataframe.py @@ -7256,10 +7256,7 @@ def test_dataframe_keys(df): def test_series_keys(ps): gds = cudf.from_pandas(ps) - if len(ps) == 0 and not isinstance(ps.index, pd.RangeIndex): - assert_eq(ps.keys().astype("float64"), gds.keys()) - else: - assert_eq(ps.keys(), gds.keys()) + assert_eq(ps.keys(), gds.keys()) @pytest_unmark_spilling diff --git a/python/cudf/cudf/tests/test_index.py b/python/cudf/cudf/tests/test_index.py index 819527ac312..506edd5b3f3 100644 --- a/python/cudf/cudf/tests/test_index.py +++ b/python/cudf/cudf/tests/test_index.py @@ -20,6 +20,7 @@ as_index, ) from cudf.testing._utils import ( + ALL_TYPES, FLOAT_TYPES, NUMERIC_TYPES, OTHER_TYPES, @@ -2703,3 +2704,26 @@ def test_index_getitem_time_duration(dtype): assert gidx[i] is pidx[i] else: assert_eq(gidx[i], pidx[i]) + + +@pytest.mark.parametrize("dtype", ALL_TYPES) +def test_index_empty_from_pandas(request, dtype): + request.node.add_marker( + pytest.mark.xfail( + condition=not PANDAS_GE_200 + and dtype + in { + "datetime64[ms]", + "datetime64[s]", + "datetime64[us]", + "timedelta64[ms]", + "timedelta64[s]", + "timedelta64[us]", + }, + reason="Fixed in pandas-2.0", + ) + ) + pidx = pd.Index([], dtype=dtype) + gidx = cudf.from_pandas(pidx) + + assert_eq(pidx, gidx) From b4da39cfbe569e290ae42ca9cf8ff868d5788757 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Thu, 7 Sep 2023 14:11:08 -0700 Subject: [PATCH 156/230] Use thread_index_type to avoid out of bounds accesses in conditional joins (#13971) See #10368 (and more recently #13771 Authors: - Vyas Ramasubramani (https://github.com/vyasr) Approvers: - Bradley Dice (https://github.com/bdice) - Yunsong Wang (https://github.com/PointKernel) - David Wendt (https://github.com/davidwendt) URL: https://github.com/rapidsai/cudf/pull/13971 --- cpp/src/join/conditional_join_kernels.cuh | 41 ++++++++++++----------- 1 file changed, 22 insertions(+), 19 deletions(-) diff --git a/cpp/src/join/conditional_join_kernels.cuh b/cpp/src/join/conditional_join_kernels.cuh index dc455ad9cef..f665aba698f 100644 --- a/cpp/src/join/conditional_join_kernels.cuh +++ b/cpp/src/join/conditional_join_kernels.cuh @@ -67,23 +67,25 @@ __global__ void compute_conditional_join_output_size( &intermediate_storage[threadIdx.x * device_expression_data.num_intermediates]; std::size_t thread_counter{0}; - cudf::size_type const start_idx = threadIdx.x + blockIdx.x * block_size; - cudf::size_type const stride = block_size * gridDim.x; - cudf::size_type const left_num_rows = left_table.num_rows(); - cudf::size_type const right_num_rows = right_table.num_rows(); - auto const outer_num_rows = (swap_tables ? right_num_rows : left_num_rows); - auto const inner_num_rows = (swap_tables ? left_num_rows : right_num_rows); + auto const start_idx = cudf::detail::grid_1d::global_thread_id(); + auto const stride = cudf::detail::grid_1d::grid_stride(); + + cudf::thread_index_type const left_num_rows = left_table.num_rows(); + cudf::thread_index_type const right_num_rows = right_table.num_rows(); + auto const outer_num_rows = (swap_tables ? right_num_rows : left_num_rows); + auto const inner_num_rows = (swap_tables ? 
left_num_rows : right_num_rows); auto evaluator = cudf::ast::detail::expression_evaluator( left_table, right_table, device_expression_data); - for (cudf::size_type outer_row_index = start_idx; outer_row_index < outer_num_rows; + for (cudf::thread_index_type outer_row_index = start_idx; outer_row_index < outer_num_rows; outer_row_index += stride) { bool found_match = false; - for (cudf::size_type inner_row_index = 0; inner_row_index < inner_num_rows; inner_row_index++) { - auto output_dest = cudf::ast::detail::value_expression_result(); - auto const left_row_index = swap_tables ? inner_row_index : outer_row_index; - auto const right_row_index = swap_tables ? outer_row_index : inner_row_index; + for (cudf::thread_index_type inner_row_index = 0; inner_row_index < inner_num_rows; + ++inner_row_index) { + auto output_dest = cudf::ast::detail::value_expression_result(); + cudf::size_type const left_row_index = swap_tables ? inner_row_index : outer_row_index; + cudf::size_type const right_row_index = swap_tables ? outer_row_index : inner_row_index; evaluator.evaluate( output_dest, left_row_index, right_row_index, 0, thread_intermediate_storage); if (output_dest.is_valid() && output_dest.value()) { @@ -161,18 +163,18 @@ __global__ void conditional_join(table_device_view left_table, auto thread_intermediate_storage = &intermediate_storage[threadIdx.x * device_expression_data.num_intermediates]; - int const warp_id = threadIdx.x / detail::warp_size; - int const lane_id = threadIdx.x % detail::warp_size; - cudf::size_type const left_num_rows = left_table.num_rows(); - cudf::size_type const right_num_rows = right_table.num_rows(); - auto const outer_num_rows = (swap_tables ? right_num_rows : left_num_rows); - auto const inner_num_rows = (swap_tables ? left_num_rows : right_num_rows); + int const warp_id = threadIdx.x / detail::warp_size; + int const lane_id = threadIdx.x % detail::warp_size; + cudf::thread_index_type const left_num_rows = left_table.num_rows(); + cudf::thread_index_type const right_num_rows = right_table.num_rows(); + cudf::thread_index_type const outer_num_rows = (swap_tables ? right_num_rows : left_num_rows); + cudf::thread_index_type const inner_num_rows = (swap_tables ? left_num_rows : right_num_rows); if (0 == lane_id) { current_idx_shared[warp_id] = 0; } __syncwarp(); - cudf::size_type outer_row_index = threadIdx.x + blockIdx.x * block_size; + auto outer_row_index = cudf::detail::grid_1d::global_thread_id(); unsigned int const activemask = __ballot_sync(0xffff'ffffu, outer_row_index < outer_num_rows); @@ -181,7 +183,8 @@ __global__ void conditional_join(table_device_view left_table, if (outer_row_index < outer_num_rows) { bool found_match = false; - for (size_type inner_row_index(0); inner_row_index < inner_num_rows; ++inner_row_index) { + for (thread_index_type inner_row_index(0); inner_row_index < inner_num_rows; + ++inner_row_index) { auto output_dest = cudf::ast::detail::value_expression_result(); auto const left_row_index = swap_tables ? inner_row_index : outer_row_index; auto const right_row_index = swap_tables ? outer_row_index : inner_row_index; From b2ab2566c155b4b753b14e5b5c013653b701148d Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Fri, 8 Sep 2023 02:22:00 -0700 Subject: [PATCH 157/230] Update doxygen to 1.9.1 (#14059) I selected this version as it is what ships with Ubuntu 22.04. I also ran `doxygen -u` to update the Doxyfile. 
Authors: - Vyas Ramasubramani (https://github.com/vyasr) Approvers: - David Wendt (https://github.com/davidwendt) - Divye Gala (https://github.com/divyegala) - Ray Douglass (https://github.com/raydouglass) URL: https://github.com/rapidsai/cudf/pull/14059 --- ci/checks/doxygen.sh | 8 +- .../all_cuda-118_arch-x86_64.yaml | 2 +- .../all_cuda-120_arch-x86_64.yaml | 2 +- cpp/doxygen/Doxyfile | 164 ++++++++++++++---- dependencies.yaml | 4 +- 5 files changed, 134 insertions(+), 46 deletions(-) diff --git a/ci/checks/doxygen.sh b/ci/checks/doxygen.sh index f260fbcd1a4..d932fa097e9 100755 --- a/ci/checks/doxygen.sh +++ b/ci/checks/doxygen.sh @@ -1,5 +1,5 @@ #!/bin/bash -# Copyright (c) 2022, NVIDIA CORPORATION. +# Copyright (c) 2022-2023, NVIDIA CORPORATION. ############################### # cuDF doxygen warnings check # ############################### @@ -13,11 +13,11 @@ fi # Utility to return version as number for comparison function version { echo "$@" | awk -F. '{ printf("%d%03d%03d%03d\n", $1,$2,$3,$4); }'; } -# doxygen supported version 1.8.20 to 1.9.1 +# doxygen supported version 1.9.1 DOXYGEN_VERSION=`doxygen --version` -if [ $(version "$DOXYGEN_VERSION") -lt $(version "1.8.20") ] || [ $(version $DOXYGEN_VERSION) -gt $(version "1.9.1") ]; then +if [ ! $(version "$DOXYGEN_VERSION") -eq $(version "1.9.1") ] ; then echo -e "warning: Unsupported doxygen version $DOXYGEN_VERSION" - echo -e "Expecting doxygen version from 1.8.20 to 1.9.1" + echo -e "Expecting doxygen version 1.9.1" exit 0 fi diff --git a/conda/environments/all_cuda-118_arch-x86_64.yaml b/conda/environments/all_cuda-118_arch-x86_64.yaml index 8965a43b8ac..692ba78f317 100644 --- a/conda/environments/all_cuda-118_arch-x86_64.yaml +++ b/conda/environments/all_cuda-118_arch-x86_64.yaml @@ -29,7 +29,7 @@ dependencies: - dask>=2023.7.1 - distributed>=2023.7.1 - dlpack>=0.5,<0.6.0a0 -- doxygen=1.8.20 +- doxygen=1.9.1 - fastavro>=0.22.9 - fmt>=9.1.0,<10 - fsspec>=0.6.0 diff --git a/conda/environments/all_cuda-120_arch-x86_64.yaml b/conda/environments/all_cuda-120_arch-x86_64.yaml index 4542eb79267..cf1bf4b8733 100644 --- a/conda/environments/all_cuda-120_arch-x86_64.yaml +++ b/conda/environments/all_cuda-120_arch-x86_64.yaml @@ -30,7 +30,7 @@ dependencies: - dask>=2023.7.1 - distributed>=2023.7.1 - dlpack>=0.5,<0.6.0a0 -- doxygen=1.8.20 +- doxygen=1.9.1 - fastavro>=0.22.9 - fmt>=9.1.0,<10 - fsspec>=0.6.0 diff --git a/cpp/doxygen/Doxyfile b/cpp/doxygen/Doxyfile index 357daed243b..b072d252881 100644 --- a/cpp/doxygen/Doxyfile +++ b/cpp/doxygen/Doxyfile @@ -1,4 +1,4 @@ -# Doxyfile 1.8.20 +# Doxyfile 1.9.1 # This file describes the settings to be used by the documentation system # doxygen (www.doxygen.org) for a project. @@ -32,7 +32,7 @@ DOXYFILE_ENCODING = UTF-8 # title of most generated pages and in a few other places. # The default value is: My Project. -PROJECT_NAME = "libcudf" +PROJECT_NAME = libcudf # The PROJECT_NUMBER tag can be used to enter a project or revision number. This # could be handy for archiving the generated documentation or if some version @@ -93,6 +93,14 @@ ALLOW_UNICODE_NAMES = NO OUTPUT_LANGUAGE = English +# The OUTPUT_TEXT_DIRECTION tag is used to specify the direction in which all +# documentation generated by doxygen is written. Doxygen will use this +# information to generate all generated output in the proper direction. +# Possible values are: None, LTR, RTL and Context. +# The default value is: None. 
+ +OUTPUT_TEXT_DIRECTION = None + # If the BRIEF_MEMBER_DESC tag is set to YES, doxygen will include brief member # descriptions after the members that are listed in the file and class # documentation (similar to Javadoc). Set to NO to disable this. @@ -305,7 +313,10 @@ OPTIMIZE_OUTPUT_SLICE = NO # Note: For files without extension you can use no_extension as a placeholder. # # Note that for custom extensions you also need to set FILE_PATTERNS otherwise -# the files are not read by doxygen. +# the files are not read by doxygen. When specifying no_extension you should add +# * to the FILE_PATTERNS. +# +# Note see also the list of default file extension mappings. EXTENSION_MAPPING = cu=C++ \ cuh=C++ @@ -516,6 +527,13 @@ EXTRACT_LOCAL_METHODS = NO EXTRACT_ANON_NSPACES = NO +# If this flag is set to YES, the name of an unnamed parameter in a declaration +# will be determined by the corresponding definition. By default unnamed +# parameters remain unnamed in the output. +# The default value is: YES. + +RESOLVE_UNNAMED_PARAMS = YES + # If the HIDE_UNDOC_MEMBERS tag is set to YES, doxygen will hide all # undocumented members inside documented classes or files. If set to NO these # members will be included in the various overviews, but no documentation @@ -553,11 +571,18 @@ HIDE_IN_BODY_DOCS = NO INTERNAL_DOCS = NO -# If the CASE_SENSE_NAMES tag is set to NO then doxygen will only generate file -# names in lower-case letters. If set to YES, upper-case letters are also -# allowed. This is useful if you have classes or files whose names only differ -# in case and if your file system supports case sensitive file names. Windows -# (including Cygwin) and Mac users are advised to set this option to NO. +# With the correct setting of option CASE_SENSE_NAMES doxygen will better be +# able to match the capabilities of the underlying filesystem. In case the +# filesystem is case sensitive (i.e. it supports files in the same directory +# whose names only differ in casing), the option must be set to YES to properly +# deal with such files in case they appear in the input. For filesystems that +# are not case sensitive the option should be be set to NO to properly deal with +# output files written for symbols that only differ in casing, such as for two +# classes, one named CLASS and the other named Class, and to also support +# references to files without having to specify the exact matching casing. On +# Windows (including Cygwin) and MacOS, users should typically set this option +# to NO, whereas on Linux or other Unix flavors it should typically be set to +# YES. # The default value is: system dependent. CASE_SENSE_NAMES = YES @@ -796,7 +821,10 @@ WARN_IF_DOC_ERROR = YES WARN_NO_PARAMDOC = YES # If the WARN_AS_ERROR tag is set to YES then doxygen will immediately stop when -# a warning is encountered. +# a warning is encountered. If the WARN_AS_ERROR tag is set to FAIL_ON_WARNINGS +# then doxygen will continue running as if WARN_AS_ERROR tag is set to NO, but +# at the end of the doxygen process doxygen will return with a non-zero status. +# Possible values are: NO, YES and FAIL_ON_WARNINGS. # The default value is: NO. WARN_AS_ERROR = NO @@ -846,8 +874,8 @@ INPUT = main_page.md \ # This tag can be used to specify the character encoding of the source files # that doxygen parses. Internally doxygen uses the UTF-8 encoding. Doxygen uses # libiconv (or the iconv built into libc) for the transcoding. 
See the libiconv -# documentation (see: https://www.gnu.org/software/libiconv/) for the list of -# possible encodings. +# documentation (see: +# https://www.gnu.org/software/libiconv/) for the list of possible encodings. # The default value is: UTF-8. INPUT_ENCODING = UTF-8 @@ -860,13 +888,15 @@ INPUT_ENCODING = UTF-8 # need to set EXTENSION_MAPPING for the extension otherwise the files are not # read by doxygen. # +# Note the list of default checked file patterns might differ from the list of +# default file extension mappings. +# # If left blank the following patterns are tested:*.c, *.cc, *.cxx, *.cpp, # *.c++, *.java, *.ii, *.ixx, *.ipp, *.i++, *.inl, *.idl, *.ddl, *.odl, *.h, # *.hh, *.hxx, *.hpp, *.h++, *.cs, *.d, *.php, *.php4, *.php5, *.phtml, *.inc, # *.m, *.markdown, *.md, *.mm, *.dox (to be provided as doxygen C comment), -# *.doc (to be provided as doxygen C comment), *.txt (to be provided as doxygen -# C comment), *.py, *.pyw, *.f90, *.f95, *.f03, *.f08, *.f18, *.f, *.for, *.vhd, -# *.vhdl, *.ucf, *.qsf and *.ice. +# *.py, *.pyw, *.f90, *.f95, *.f03, *.f08, *.f18, *.f, *.for, *.vhd, *.vhdl, +# *.ucf, *.qsf and *.ice. FILE_PATTERNS = *.cpp \ *.hpp \ @@ -1270,10 +1300,11 @@ HTML_INDEX_NUM_ENTRIES = 100 # If the GENERATE_DOCSET tag is set to YES, additional index files will be # generated that can be used as input for Apple's Xcode 3 integrated development -# environment (see: https://developer.apple.com/xcode/), introduced with OSX -# 10.5 (Leopard). To create a documentation set, doxygen will generate a -# Makefile in the HTML output directory. Running make will produce the docset in -# that directory and running make install will install the docset in +# environment (see: +# https://developer.apple.com/xcode/), introduced with OSX 10.5 (Leopard). To +# create a documentation set, doxygen will generate a Makefile in the HTML +# output directory. Running make will produce the docset in that directory and +# running make install will install the docset in # ~/Library/Developer/Shared/Documentation/DocSets so that Xcode will find it at # startup. See https://developer.apple.com/library/archive/featuredarticles/Doxy # genXcode/_index.html for more information. @@ -1315,8 +1346,8 @@ DOCSET_PUBLISHER_NAME = Publisher # If the GENERATE_HTMLHELP tag is set to YES then doxygen generates three # additional HTML index files: index.hhp, index.hhc, and index.hhk. The # index.hhp is a project file that can be read by Microsoft's HTML Help Workshop -# (see: https://www.microsoft.com/en-us/download/details.aspx?id=21138) on -# Windows. +# (see: +# https://www.microsoft.com/en-us/download/details.aspx?id=21138) on Windows. # # The HTML Help Workshop contains a compiler that can convert all HTML output # generated by doxygen into a single compiled HTML file (.chm). Compiled HTML @@ -1391,7 +1422,8 @@ QCH_FILE = # The QHP_NAMESPACE tag specifies the namespace to use when generating Qt Help # Project output. For more information please see Qt Help Project / Namespace -# (see: https://doc.qt.io/archives/qt-4.8/qthelpproject.html#namespace). +# (see: +# https://doc.qt.io/archives/qt-4.8/qthelpproject.html#namespace). # The default value is: org.doxygen.Project. # This tag requires that the tag GENERATE_QHP is set to YES. @@ -1399,8 +1431,8 @@ QHP_NAMESPACE = org.doxygen.Project # The QHP_VIRTUAL_FOLDER tag specifies the namespace to use when generating Qt # Help Project output. 
For more information please see Qt Help Project / Virtual -# Folders (see: https://doc.qt.io/archives/qt-4.8/qthelpproject.html#virtual- -# folders). +# Folders (see: +# https://doc.qt.io/archives/qt-4.8/qthelpproject.html#virtual-folders). # The default value is: doc. # This tag requires that the tag GENERATE_QHP is set to YES. @@ -1408,16 +1440,16 @@ QHP_VIRTUAL_FOLDER = doc # If the QHP_CUST_FILTER_NAME tag is set, it specifies the name of a custom # filter to add. For more information please see Qt Help Project / Custom -# Filters (see: https://doc.qt.io/archives/qt-4.8/qthelpproject.html#custom- -# filters). +# Filters (see: +# https://doc.qt.io/archives/qt-4.8/qthelpproject.html#custom-filters). # This tag requires that the tag GENERATE_QHP is set to YES. QHP_CUST_FILTER_NAME = # The QHP_CUST_FILTER_ATTRS tag specifies the list of the attributes of the # custom filter to add. For more information please see Qt Help Project / Custom -# Filters (see: https://doc.qt.io/archives/qt-4.8/qthelpproject.html#custom- -# filters). +# Filters (see: +# https://doc.qt.io/archives/qt-4.8/qthelpproject.html#custom-filters). # This tag requires that the tag GENERATE_QHP is set to YES. QHP_CUST_FILTER_ATTRS = @@ -1429,9 +1461,9 @@ QHP_CUST_FILTER_ATTRS = QHP_SECT_FILTER_ATTRS = -# The QHG_LOCATION tag can be used to specify the location of Qt's -# qhelpgenerator. If non-empty doxygen will try to run qhelpgenerator on the -# generated .qhp file. +# The QHG_LOCATION tag can be used to specify the location (absolute path +# including file name) of Qt's qhelpgenerator. If non-empty doxygen will try to +# run qhelpgenerator on the generated .qhp file. # This tag requires that the tag GENERATE_QHP is set to YES. QHG_LOCATION = @@ -1558,7 +1590,7 @@ USE_MATHJAX = NO # When MathJax is enabled you can set the default output format to be used for # the MathJax output. See the MathJax site (see: -# http://docs.mathjax.org/en/latest/output.html) for more details. +# http://docs.mathjax.org/en/v2.7-latest/output.html) for more details. # Possible values are: HTML-CSS (which is slower, but has the best # compatibility), NativeMML (i.e. MathML) and SVG. # The default value is: HTML-CSS. @@ -1588,7 +1620,8 @@ MATHJAX_EXTENSIONS = # The MATHJAX_CODEFILE tag can be used to specify a file with javascript pieces # of code that will be used on startup of the MathJax code. See the MathJax site -# (see: http://docs.mathjax.org/en/latest/output.html) for more details. For an +# (see: +# http://docs.mathjax.org/en/v2.7-latest/output.html) for more details. For an # example see the documentation. # This tag requires that the tag USE_MATHJAX is set to YES. @@ -1635,7 +1668,8 @@ SERVER_BASED_SEARCH = NO # # Doxygen ships with an example indexer (doxyindexer) and search engine # (doxysearch.cgi) which are based on the open source search engine library -# Xapian (see: https://xapian.org/). +# Xapian (see: +# https://xapian.org/). # # See the section "External Indexing and Searching" for details. # The default value is: NO. @@ -1648,8 +1682,9 @@ EXTERNAL_SEARCH = NO # # Doxygen ships with an example indexer (doxyindexer) and search engine # (doxysearch.cgi) which are based on the open source search engine library -# Xapian (see: https://xapian.org/). See the section "External Indexing and -# Searching" for details. +# Xapian (see: +# https://xapian.org/). See the section "External Indexing and Searching" for +# details. # This tag requires that the tag SEARCHENGINE is set to YES. 
SEARCHENGINE_URL = @@ -1839,6 +1874,16 @@ LATEX_BATCHMODE = NO LATEX_HIDE_INDICES = NO +# If the LATEX_SOURCE_CODE tag is set to YES then doxygen will include source +# code with syntax highlighting in the LaTeX output. +# +# Note that which sources are shown also depends on other settings such as +# SOURCE_BROWSER. +# The default value is: NO. +# This tag requires that the tag GENERATE_LATEX is set to YES. + +LATEX_SOURCE_CODE = NO + # The LATEX_BIB_STYLE tag can be used to specify the style to use for the # bibliography, e.g. plainnat, or ieeetr. See # https://en.wikipedia.org/wiki/BibTeX and \cite for more info. @@ -1919,6 +1964,16 @@ RTF_STYLESHEET_FILE = RTF_EXTENSIONS_FILE = +# If the RTF_SOURCE_CODE tag is set to YES then doxygen will include source code +# with syntax highlighting in the RTF output. +# +# Note that which sources are shown also depends on other settings such as +# SOURCE_BROWSER. +# The default value is: NO. +# This tag requires that the tag GENERATE_RTF is set to YES. + +RTF_SOURCE_CODE = NO + #--------------------------------------------------------------------------- # Configuration options related to the man page output #--------------------------------------------------------------------------- @@ -2015,6 +2070,15 @@ GENERATE_DOCBOOK = NO DOCBOOK_OUTPUT = docbook +# If the DOCBOOK_PROGRAMLISTING tag is set to YES, doxygen will include the +# program listings (including syntax highlighting and cross-referencing +# information) to the DOCBOOK output. Note that enabling this will significantly +# increase the size of the DOCBOOK output. +# The default value is: NO. +# This tag requires that the tag GENERATE_DOCBOOK is set to YES. + +DOCBOOK_PROGRAMLISTING = NO + #--------------------------------------------------------------------------- # Configuration options for the AutoGen Definitions output #--------------------------------------------------------------------------- @@ -2301,10 +2365,32 @@ UML_LOOK = NO # but if the number exceeds 15, the total amount of fields shown is limited to # 10. # Minimum value: 0, maximum value: 100, default value: 10. -# This tag requires that the tag HAVE_DOT is set to YES. +# This tag requires that the tag UML_LOOK is set to YES. UML_LIMIT_NUM_FIELDS = 10 +# If the DOT_UML_DETAILS tag is set to NO, doxygen will show attributes and +# methods without types and arguments in the UML graphs. If the DOT_UML_DETAILS +# tag is set to YES, doxygen will add type and arguments for attributes and +# methods in the UML graphs. If the DOT_UML_DETAILS tag is set to NONE, doxygen +# will not generate fields with class member information in the UML graphs. The +# class diagrams will look similar to the default class diagrams but using UML +# notation for the relationships. +# Possible values are: NO, YES and NONE. +# The default value is: NO. +# This tag requires that the tag UML_LOOK is set to YES. + +DOT_UML_DETAILS = NO + +# The DOT_WRAP_THRESHOLD tag can be used to set the maximum number of characters +# to display on a single line. If the actual line length exceeds this threshold +# significantly it will wrapped across multiple lines. Some heuristics are apply +# to avoid ugly line breaks. +# Minimum value: 0, maximum value: 1000, default value: 17. +# This tag requires that the tag HAVE_DOT is set to YES. + +DOT_WRAP_THRESHOLD = 17 + # If the TEMPLATE_RELATIONS tag is set to YES then the inheritance and # collaboration graphs will show the relations between templates and their # instances. 
@@ -2494,9 +2580,11 @@ DOT_MULTI_TARGETS = NO GENERATE_LEGEND = YES -# If the DOT_CLEANUP tag is set to YES, doxygen will remove the intermediate dot +# If the DOT_CLEANUP tag is set to YES, doxygen will remove the intermediate # files that are used to generate the various graphs. +# +# Note: This setting is not only used for dot files but also for msc and +# plantuml temporary files. # The default value is: YES. -# This tag requires that the tag HAVE_DOT is set to YES. DOT_CLEANUP = YES diff --git a/dependencies.yaml b/dependencies.yaml index 97f86c6b864..f99b7404854 100644 --- a/dependencies.yaml +++ b/dependencies.yaml @@ -375,13 +375,13 @@ dependencies: - identify>=2.5.20 - output_types: conda packages: - - doxygen=1.8.20 # pre-commit hook needs a specific version. + - &doxygen doxygen=1.9.1 # pre-commit hook needs a specific version. docs: common: - output_types: [conda] packages: - dask-cuda==23.10.* - - doxygen=1.8.20 + - *doxygen - make - myst-nb - nbsphinx From e43809ea9f9ba2015ebab3eb4d2b9ca7dfa72849 Mon Sep 17 00:00:00 2001 From: Lawrence Mitchell Date: Fri, 8 Sep 2023 17:47:41 +0100 Subject: [PATCH 158/230] Use `conda mambabuild` rather than `mamba mambabuild` (#14067) Since Conda 23.7.3, the plugin mechanism changed, and mambabuild broke. Since, with `boa` installed, `conda mambabuild` uses the `libmamba` solver, switch to that. The general handling of subcommands with `mamba` was partially fixed in mamba-org/mamba#2732, but `mamba build` does not currently work due to mamba-org/mamba#2821. - Closes #14068 Authors: - Lawrence Mitchell (https://github.com/wence-) Approvers: - Ray Douglass (https://github.com/raydouglass) URL: https://github.com/rapidsai/cudf/pull/14067 --- ci/build_cpp.sh | 3 ++- ci/build_python.sh | 9 +++++---- 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/ci/build_cpp.sh b/ci/build_cpp.sh index 3bd18a88139..8b757fecf5a 100755 --- a/ci/build_cpp.sh +++ b/ci/build_cpp.sh @@ -11,7 +11,8 @@ rapids-print-env rapids-logger "Begin cpp build" -rapids-mamba-retry mambabuild \ +# With boa installed conda build forward to boa +rapids-conda-retry mambabuild \ conda/recipes/libcudf rapids-upload-conda-to-s3 cpp diff --git a/ci/build_python.sh b/ci/build_python.sh index ec34d63b282..61f160b25f5 100755 --- a/ci/build_python.sh +++ b/ci/build_python.sh @@ -15,24 +15,25 @@ CPP_CHANNEL=$(rapids-download-conda-from-s3 cpp) # TODO: Remove `--no-test` flag once importing on a CPU # node works correctly -rapids-mamba-retry mambabuild \ +# With boa installed conda build forwards to the boa builder +rapids-conda-retry mambabuild \ --no-test \ --channel "${CPP_CHANNEL}" \ conda/recipes/cudf -rapids-mamba-retry mambabuild \ +rapids-conda-retry mambabuild \ --no-test \ --channel "${CPP_CHANNEL}" \ --channel "${RAPIDS_CONDA_BLD_OUTPUT_DIR}" \ conda/recipes/dask-cudf -rapids-mamba-retry mambabuild \ +rapids-conda-retry mambabuild \ --no-test \ --channel "${CPP_CHANNEL}" \ --channel "${RAPIDS_CONDA_BLD_OUTPUT_DIR}" \ conda/recipes/cudf_kafka -rapids-mamba-retry mambabuild \ +rapids-conda-retry mambabuild \ --no-test \ --channel "${CPP_CHANNEL}" \ --channel "${RAPIDS_CONDA_BLD_OUTPUT_DIR}" \ From 01730c46a4f403fd5cf9245512c941176eef2428 Mon Sep 17 00:00:00 2001 From: GALI PREM SAGAR Date: Fri, 8 Sep 2023 13:36:48 -0500 Subject: [PATCH 159/230] Fix `Index.difference` to match with pandas (#14053) This PR fixes `Index.difference` in following ways: - [x] Fixes `name` preservation by correctly evaluating the name of two input objects, closes #14019 - [x] Fixes 
`is_mixed_with_object_dtype` handling that will resolve incorrect results for `CategoricalIndex`, closes #14022 - [x] Raises errors for invalid input types, the error messages are an exact match to pandas error messages for parity. - [x] Introduce a `Range._try_reconstruct_range_index` that will try to re-construct a `RangeIndex` out of an `Int..Index` to save memory- this is on parity with pandas. closes #14013 Authors: - GALI PREM SAGAR (https://github.com/galipremsagar) Approvers: - Lawrence Mitchell (https://github.com/wence-) URL: https://github.com/rapidsai/cudf/pull/14053 --- python/cudf/cudf/core/_base_index.py | 12 ++++++++++-- python/cudf/cudf/core/index.py | 22 ++++++++++++++++++++++ python/cudf/cudf/tests/test_index.py | 21 +++++++++++++++++++++ python/cudf/cudf/utils/dtypes.py | 5 +++++ 4 files changed, 58 insertions(+), 2 deletions(-) diff --git a/python/cudf/cudf/core/_base_index.py b/python/cudf/cudf/core/_base_index.py index 829ca33d8a5..8091f3f7dd2 100644 --- a/python/cudf/cudf/core/_base_index.py +++ b/python/cudf/cudf/core/_base_index.py @@ -30,7 +30,7 @@ from cudf.core.column import ColumnBase, column from cudf.core.column_accessor import ColumnAccessor from cudf.utils import ioutils -from cudf.utils.dtypes import is_mixed_with_object_dtype +from cudf.utils.dtypes import can_convert_to_column, is_mixed_with_object_dtype from cudf.utils.utils import _is_same_name @@ -935,13 +935,21 @@ def difference(self, other, sort=None): >>> idx1.difference(idx2, sort=False) Int64Index([2, 1], dtype='int64') """ + if not can_convert_to_column(other): + raise TypeError("Input must be Index or array-like") + if sort not in {None, False}: raise ValueError( f"The 'sort' keyword only takes the values " f"of None or False; {sort} was passed." ) - other = cudf.Index(other) + other = cudf.Index(other, name=getattr(other, "name", self.name)) + + if not len(other): + return self._get_reconciled_name_object(other) + elif self.equals(other): + return self[:0]._get_reconciled_name_object(other) res_name = _get_result_name(self.name, other.name) diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py index c7e25cdc430..4bb5428838f 100644 --- a/python/cudf/cudf/core/index.py +++ b/python/cudf/cudf/core/index.py @@ -724,6 +724,28 @@ def _intersection(self, other, sort=False): return new_index + @_cudf_nvtx_annotate + def difference(self, other, sort=None): + if isinstance(other, RangeIndex) and self.equals(other): + return self[:0]._get_reconciled_name_object(other) + + return self._try_reconstruct_range_index( + super().difference(other, sort=sort) + ) + + def _try_reconstruct_range_index(self, index): + if isinstance(index, RangeIndex) or index.dtype.kind == "f": + return index + # Evenly spaced values can return a + # RangeIndex instead of a materialized Index. 
+ if not index._column.has_nulls(): + uniques = cupy.unique(cupy.diff(index.values)) + if len(uniques) == 1 and uniques[0].get() != 0: + diff = uniques[0].get() + new_range = range(index[0], index[-1] + diff, diff) + return type(self)(new_range, name=index.name) + return index + def sort_values( self, return_indexer=False, diff --git a/python/cudf/cudf/tests/test_index.py b/python/cudf/cudf/tests/test_index.py index 506edd5b3f3..58dbc48e31e 100644 --- a/python/cudf/cudf/tests/test_index.py +++ b/python/cudf/cudf/tests/test_index.py @@ -789,6 +789,10 @@ def test_index_to_series(data): ["5", "6", "2", "a", "b", "c"], [1.0, 2.0, 3.0, 4.0, 5.0, 6.0], [1.0, 5.0, 6.0, 0.0, 1.3], + ["ab", "cd", "ef"], + pd.Series(["1", "2", "a", "3", None], dtype="category"), + range(0, 10), + [], ], ) @pytest.mark.parametrize( @@ -799,8 +803,11 @@ def test_index_to_series(data): [10, 20, 30, 40, 50, 60], ["1", "2", "3", "4", "5", "6"], ["5", "6", "2", "a", "b", "c"], + ["ab", "ef", None], [1.0, 2.0, 3.0, 4.0, 5.0, 6.0], [1.0, 5.0, 6.0, 0.0, 1.3], + range(2, 4), + pd.Series(["1", "a", "3", None], dtype="category"), [], ], ) @@ -818,9 +825,23 @@ def test_index_difference(data, other, sort, name_data, name_other): expected = pd_data.difference(pd_other, sort=sort) actual = gd_data.difference(gd_other, sort=sort) + assert_eq(expected, actual) +@pytest.mark.parametrize("other", ["a", 1, None]) +def test_index_difference_invalid_inputs(other): + pdi = pd.Index([1, 2, 3]) + gdi = cudf.Index([1, 2, 3]) + + assert_exceptions_equal( + pdi.difference, + gdi.difference, + ([other], {}), + ([other], {}), + ) + + def test_index_difference_sort_error(): pdi = pd.Index([1, 2, 3]) gdi = cudf.Index([1, 2, 3]) diff --git a/python/cudf/cudf/utils/dtypes.py b/python/cudf/cudf/utils/dtypes.py index ea96a0859ce..e50457b8e7b 100644 --- a/python/cudf/cudf/utils/dtypes.py +++ b/python/cudf/cudf/utils/dtypes.py @@ -426,6 +426,11 @@ def get_min_float_dtype(col): def is_mixed_with_object_dtype(lhs, rhs): + if cudf.api.types.is_categorical_dtype(lhs.dtype): + return is_mixed_with_object_dtype(lhs.dtype.categories, rhs) + elif cudf.api.types.is_categorical_dtype(rhs.dtype): + return is_mixed_with_object_dtype(lhs, rhs.dtype.categories) + return (lhs.dtype == "object" and rhs.dtype != "object") or ( rhs.dtype == "object" and lhs.dtype != "object" ) From 36ee11a719645feec6d5bcf089ac3a3ac20cb621 Mon Sep 17 00:00:00 2001 From: AJ Schmidt Date: Fri, 8 Sep 2023 17:37:13 -0400 Subject: [PATCH 160/230] Remove header tests (#14072) From some internal Slack discussions, it was determined that the `headers_test.sh` file is no longer necessary. This PR removes it and its associated checks. 
Authors: - AJ Schmidt (https://github.com/ajschmidt8) Approvers: - Ray Douglass (https://github.com/raydouglass) URL: https://github.com/rapidsai/cudf/pull/14072 --- .pre-commit-config.yaml | 12 -- ci/checks/headers_test.sh | 25 ---- conda/recipes/libcudf/meta.yaml | 242 -------------------------------- 3 files changed, 279 deletions(-) delete mode 100755 ci/checks/headers_test.sh diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index b5165cf026f..238e5b44030 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -128,18 +128,6 @@ repos: language: system pass_filenames: false verbose: true - - id: headers-recipe-check - name: headers-recipe-check - entry: ./ci/checks/headers_test.sh - files: | - (?x)^( - ^cpp/include/| - ^conda/.*/meta.yaml - ) - types_or: [file] - language: system - pass_filenames: false - verbose: false - repo: https://github.com/codespell-project/codespell rev: v2.2.2 hooks: diff --git a/ci/checks/headers_test.sh b/ci/checks/headers_test.sh deleted file mode 100755 index b859009a8c5..00000000000 --- a/ci/checks/headers_test.sh +++ /dev/null @@ -1,25 +0,0 @@ -#!/bin/bash -# Copyright (c) 2020-2022, NVIDIA CORPORATION. -##################################### -# conda existence test for headers # -##################################### - -RETVAL=0 -LIBNAME=cudf -DIRNAMES="cudf cudf_test" - -# existence tests for lib${LIBNAME} -for DIRNAME in ${DIRNAMES[@]}; do - HEADERS=`cd cpp && find include/${DIRNAME} -type f \( -iname "*.h" -o -iname "*.hpp" \) -print | sed 's|^| - test -f $PREFIX/|' | sort` - META_TESTS=`grep -E "test -f .*/include/${DIRNAME}/.*\.h(pp)?" conda/recipes/lib${LIBNAME}/meta.yaml | sort` - HEADER_DIFF=`diff <(echo "$HEADERS") <(echo "$META_TESTS")` - LIB_RETVAL=$? - - if [ "$LIB_RETVAL" != "0" ]; then - echo -e ">>>> FAILED: lib${LIBNAME} has different headers in include/${DIRNAME}/ and conda/recipes/lib${LIBNAME}/meta.yaml. 
The diff is shown below:" - echo -e "$HEADER_DIFF" - RETVAL=1 - fi -done - -exit $RETVAL diff --git a/conda/recipes/libcudf/meta.yaml b/conda/recipes/libcudf/meta.yaml index c844131ad31..627065817ba 100644 --- a/conda/recipes/libcudf/meta.yaml +++ b/conda/recipes/libcudf/meta.yaml @@ -110,249 +110,7 @@ outputs: test: commands: - test -f $PREFIX/lib/libcudf.so - - test -f $PREFIX/lib/libcudftestutil.a - - test -f $PREFIX/lib/libcudf_identify_stream_usage_mode_cudf.so - - test -f $PREFIX/lib/libcudf_identify_stream_usage_mode_testing.so - - test -f $PREFIX/include/cudf/aggregation.hpp - - test -f $PREFIX/include/cudf/ast/detail/expression_parser.hpp - - test -f $PREFIX/include/cudf/ast/detail/expression_transformer.hpp - - test -f $PREFIX/include/cudf/ast/detail/operators.hpp - - test -f $PREFIX/include/cudf/ast/expressions.hpp - - test -f $PREFIX/include/cudf/binaryop.hpp - test -f $PREFIX/include/cudf/column/column.hpp - - test -f $PREFIX/include/cudf/column/column_factories.hpp - - test -f $PREFIX/include/cudf/column/column_view.hpp - - test -f $PREFIX/include/cudf/concatenate.hpp - - test -f $PREFIX/include/cudf/contiguous_split.hpp - - test -f $PREFIX/include/cudf/copying.hpp - - test -f $PREFIX/include/cudf/datetime.hpp - - test -f $PREFIX/include/cudf/timezone.hpp - - test -f $PREFIX/include/cudf/detail/aggregation/aggregation.hpp - - test -f $PREFIX/include/cudf/detail/aggregation/result_cache.hpp - - test -f $PREFIX/include/cudf/detail/binaryop.hpp - - test -f $PREFIX/include/cudf/detail/calendrical_month_sequence.cuh - - test -f $PREFIX/include/cudf/detail/concatenate.hpp - - test -f $PREFIX/include/cudf/detail/concatenate_masks.hpp - - test -f $PREFIX/include/cudf/detail/contiguous_split.hpp - - test -f $PREFIX/include/cudf/detail/copy.hpp - - test -f $PREFIX/include/cudf/detail/datetime.hpp - - test -f $PREFIX/include/cudf/detail/fill.hpp - - test -f $PREFIX/include/cudf/detail/gather.hpp - - test -f $PREFIX/include/cudf/detail/groupby.hpp - - test -f $PREFIX/include/cudf/detail/groupby/group_replace_nulls.hpp - - test -f $PREFIX/include/cudf/detail/groupby/sort_helper.hpp - - test -f $PREFIX/include/cudf/detail/interop.hpp - - test -f $PREFIX/include/cudf/detail/is_element_valid.hpp - - test -f $PREFIX/include/cudf/detail/join.hpp - - test -f $PREFIX/include/cudf/detail/label_bins.hpp - - test -f $PREFIX/include/cudf/detail/null_mask.hpp - - test -f $PREFIX/include/cudf/detail/nvtx/nvtx3.hpp - - test -f $PREFIX/include/cudf/detail/nvtx/ranges.hpp - - test -f $PREFIX/include/cudf/detail/quantiles.hpp - - test -f $PREFIX/include/cudf/detail/repeat.hpp - - test -f $PREFIX/include/cudf/detail/replace.hpp - - test -f $PREFIX/include/cudf/detail/reshape.hpp - - test -f $PREFIX/include/cudf/detail/rolling.hpp - - test -f $PREFIX/include/cudf/detail/round.hpp - - test -f $PREFIX/include/cudf/detail/scan.hpp - - test -f $PREFIX/include/cudf/detail/scatter.hpp - - test -f $PREFIX/include/cudf/detail/search.hpp - - test -f $PREFIX/include/cudf/detail/sequence.hpp - - test -f $PREFIX/include/cudf/detail/sorting.hpp - - test -f $PREFIX/include/cudf/detail/stream_compaction.hpp - - test -f $PREFIX/include/cudf/detail/structs/utilities.hpp - - test -f $PREFIX/include/cudf/detail/tdigest/tdigest.hpp - - test -f $PREFIX/include/cudf/detail/timezone.cuh - - test -f $PREFIX/include/cudf/detail/timezone.hpp - - test -f $PREFIX/include/cudf/detail/transform.hpp - - test -f $PREFIX/include/cudf/detail/transpose.hpp - - test -f $PREFIX/include/cudf/detail/unary.hpp - - test -f 
$PREFIX/include/cudf/detail/utilities/alignment.hpp - - test -f $PREFIX/include/cudf/detail/utilities/default_stream.hpp - - test -f $PREFIX/include/cudf/detail/utilities/int_fastdiv.h - - test -f $PREFIX/include/cudf/detail/utilities/integer_utils.hpp - - test -f $PREFIX/include/cudf/detail/utilities/linked_column.hpp - - test -f $PREFIX/include/cudf/detail/utilities/logger.hpp - - test -f $PREFIX/include/cudf/detail/utilities/pinned_host_vector.hpp - - test -f $PREFIX/include/cudf/detail/utilities/stacktrace.hpp - - test -f $PREFIX/include/cudf/detail/utilities/vector_factories.hpp - - test -f $PREFIX/include/cudf/detail/utilities/visitor_overload.hpp - - test -f $PREFIX/include/cudf/dictionary/detail/concatenate.hpp - - test -f $PREFIX/include/cudf/dictionary/detail/encode.hpp - - test -f $PREFIX/include/cudf/dictionary/detail/merge.hpp - - test -f $PREFIX/include/cudf/dictionary/detail/replace.hpp - - test -f $PREFIX/include/cudf/dictionary/detail/search.hpp - - test -f $PREFIX/include/cudf/dictionary/detail/update_keys.hpp - - test -f $PREFIX/include/cudf/dictionary/dictionary_column_view.hpp - - test -f $PREFIX/include/cudf/dictionary/dictionary_factories.hpp - - test -f $PREFIX/include/cudf/dictionary/encode.hpp - - test -f $PREFIX/include/cudf/dictionary/search.hpp - - test -f $PREFIX/include/cudf/dictionary/update_keys.hpp - - test -f $PREFIX/include/cudf/filling.hpp - - test -f $PREFIX/include/cudf/fixed_point/fixed_point.hpp - - test -f $PREFIX/include/cudf/fixed_point/temporary.hpp - - test -f $PREFIX/include/cudf/groupby.hpp - - test -f $PREFIX/include/cudf/hashing.hpp - - test -f $PREFIX/include/cudf/hashing/detail/hashing.hpp - - test -f $PREFIX/include/cudf/interop.hpp - - test -f $PREFIX/include/cudf/io/arrow_io_source.hpp - - test -f $PREFIX/include/cudf/io/avro.hpp - - test -f $PREFIX/include/cudf/io/csv.hpp - - test -f $PREFIX/include/cudf/io/data_sink.hpp - - test -f $PREFIX/include/cudf/io/datasource.hpp - - test -f $PREFIX/include/cudf/io/detail/avro.hpp - - test -f $PREFIX/include/cudf/io/detail/csv.hpp - - test -f $PREFIX/include/cudf/io/detail/json.hpp - - test -f $PREFIX/include/cudf/io/detail/tokenize_json.hpp - - test -f $PREFIX/include/cudf/io/detail/orc.hpp - - test -f $PREFIX/include/cudf/io/detail/parquet.hpp - - test -f $PREFIX/include/cudf/io/detail/utils.hpp - - test -f $PREFIX/include/cudf/io/json.hpp - - test -f $PREFIX/include/cudf/io/orc.hpp - - test -f $PREFIX/include/cudf/io/orc_metadata.hpp - - test -f $PREFIX/include/cudf/io/orc_types.hpp - - test -f $PREFIX/include/cudf/io/parquet.hpp - - test -f $PREFIX/include/cudf/io/parquet_metadata.hpp - - test -f $PREFIX/include/cudf/io/text/byte_range_info.hpp - - test -f $PREFIX/include/cudf/io/text/data_chunk_source.hpp - - test -f $PREFIX/include/cudf/io/text/data_chunk_source_factories.hpp - - test -f $PREFIX/include/cudf/io/text/detail/bgzip_utils.hpp - - test -f $PREFIX/include/cudf/io/text/detail/multistate.hpp - - test -f $PREFIX/include/cudf/io/text/detail/tile_state.hpp - - test -f $PREFIX/include/cudf/io/text/detail/trie.hpp - - test -f $PREFIX/include/cudf/io/text/multibyte_split.hpp - - test -f $PREFIX/include/cudf/io/types.hpp - - test -f $PREFIX/include/cudf/join.hpp - - test -f $PREFIX/include/cudf/labeling/label_bins.hpp - - test -f $PREFIX/include/cudf/lists/combine.hpp - - test -f $PREFIX/include/cudf/lists/contains.hpp - - test -f $PREFIX/include/cudf/lists/count_elements.hpp - - test -f $PREFIX/include/cudf/lists/detail/combine.hpp - - test -f 
$PREFIX/include/cudf/lists/detail/concatenate.hpp - - test -f $PREFIX/include/cudf/lists/detail/contains.hpp - - test -f $PREFIX/include/cudf/lists/detail/copying.hpp - - test -f $PREFIX/include/cudf/lists/detail/dremel.hpp - - test -f $PREFIX/include/cudf/lists/detail/extract.hpp - - test -f $PREFIX/include/cudf/lists/detail/interleave_columns.hpp - - test -f $PREFIX/include/cudf/lists/detail/lists_column_factories.hpp - - test -f $PREFIX/include/cudf/lists/detail/reverse.hpp - - test -f $PREFIX/include/cudf/lists/detail/scatter_helper.cuh - - test -f $PREFIX/include/cudf/lists/detail/set_operations.hpp - - test -f $PREFIX/include/cudf/lists/detail/sorting.hpp - - test -f $PREFIX/include/cudf/lists/detail/stream_compaction.hpp - - test -f $PREFIX/include/cudf/lists/explode.hpp - - test -f $PREFIX/include/cudf/lists/extract.hpp - - test -f $PREFIX/include/cudf/lists/filling.hpp - - test -f $PREFIX/include/cudf/lists/gather.hpp - - test -f $PREFIX/include/cudf/lists/list_view.hpp - - test -f $PREFIX/include/cudf/lists/lists_column_view.hpp - - test -f $PREFIX/include/cudf/lists/reverse.hpp - - test -f $PREFIX/include/cudf/lists/set_operations.hpp - - test -f $PREFIX/include/cudf/lists/sorting.hpp - - test -f $PREFIX/include/cudf/lists/stream_compaction.hpp - - test -f $PREFIX/include/cudf/merge.hpp - - test -f $PREFIX/include/cudf/null_mask.hpp - - test -f $PREFIX/include/cudf/partitioning.hpp - - test -f $PREFIX/include/cudf/quantiles.hpp - - test -f $PREFIX/include/cudf/reduction.hpp - - test -f $PREFIX/include/cudf/reduction/detail/reduction.hpp - - test -f $PREFIX/include/cudf/reduction/detail/reduction_functions.hpp - - test -f $PREFIX/include/cudf/reduction/detail/segmented_reduction_functions.hpp - - test -f $PREFIX/include/cudf/replace.hpp - - test -f $PREFIX/include/cudf/reshape.hpp - - test -f $PREFIX/include/cudf/rolling.hpp - - test -f $PREFIX/include/cudf/rolling/range_window_bounds.hpp - - test -f $PREFIX/include/cudf/round.hpp - - test -f $PREFIX/include/cudf/scalar/scalar.hpp - - test -f $PREFIX/include/cudf/scalar/scalar_factories.hpp - - test -f $PREFIX/include/cudf/search.hpp - - test -f $PREFIX/include/cudf/sorting.hpp - - test -f $PREFIX/include/cudf/stream_compaction.hpp - - test -f $PREFIX/include/cudf/strings/attributes.hpp - - test -f $PREFIX/include/cudf/strings/capitalize.hpp - - test -f $PREFIX/include/cudf/strings/case.hpp - - test -f $PREFIX/include/cudf/strings/char_types/char_cases.hpp - - test -f $PREFIX/include/cudf/strings/char_types/char_types.hpp - - test -f $PREFIX/include/cudf/strings/char_types/char_types_enum.hpp - - test -f $PREFIX/include/cudf/strings/combine.hpp - - test -f $PREFIX/include/cudf/strings/contains.hpp - - test -f $PREFIX/include/cudf/strings/convert/convert_booleans.hpp - - test -f $PREFIX/include/cudf/strings/convert/convert_datetime.hpp - - test -f $PREFIX/include/cudf/strings/convert/convert_durations.hpp - - test -f $PREFIX/include/cudf/strings/convert/convert_fixed_point.hpp - - test -f $PREFIX/include/cudf/strings/convert/convert_floats.hpp - - test -f $PREFIX/include/cudf/strings/convert/convert_integers.hpp - - test -f $PREFIX/include/cudf/strings/convert/convert_ipv4.hpp - - test -f $PREFIX/include/cudf/strings/convert/convert_lists.hpp - - test -f $PREFIX/include/cudf/strings/convert/convert_urls.hpp - - test -f $PREFIX/include/cudf/strings/detail/char_tables.hpp - - test -f $PREFIX/include/cudf/strings/detail/combine.hpp - - test -f $PREFIX/include/cudf/strings/detail/concatenate.hpp - - test -f 
$PREFIX/include/cudf/strings/detail/converters.hpp - - test -f $PREFIX/include/cudf/strings/detail/copying.hpp - - test -f $PREFIX/include/cudf/strings/detail/fill.hpp - - test -f $PREFIX/include/cudf/strings/detail/json.hpp - - test -f $PREFIX/include/cudf/strings/detail/replace.hpp - - test -f $PREFIX/include/cudf/strings/detail/utf8.hpp - - test -f $PREFIX/include/cudf/strings/detail/utilities.hpp - - test -f $PREFIX/include/cudf/strings/extract.hpp - - test -f $PREFIX/include/cudf/strings/find.hpp - - test -f $PREFIX/include/cudf/strings/find_multiple.hpp - - test -f $PREFIX/include/cudf/strings/findall.hpp - - test -f $PREFIX/include/cudf/strings/json.hpp - - test -f $PREFIX/include/cudf/strings/padding.hpp - - test -f $PREFIX/include/cudf/strings/regex/flags.hpp - - test -f $PREFIX/include/cudf/strings/regex/regex_program.hpp - - test -f $PREFIX/include/cudf/strings/repeat_strings.hpp - - test -f $PREFIX/include/cudf/strings/replace.hpp - - test -f $PREFIX/include/cudf/strings/replace_re.hpp - - test -f $PREFIX/include/cudf/strings/reverse.hpp - - test -f $PREFIX/include/cudf/strings/side_type.hpp - - test -f $PREFIX/include/cudf/strings/slice.hpp - - test -f $PREFIX/include/cudf/strings/split/partition.hpp - - test -f $PREFIX/include/cudf/strings/split/split.hpp - - test -f $PREFIX/include/cudf/strings/split/split_re.hpp - - test -f $PREFIX/include/cudf/strings/string_view.hpp - - test -f $PREFIX/include/cudf/strings/strings_column_view.hpp - - test -f $PREFIX/include/cudf/strings/strip.hpp - - test -f $PREFIX/include/cudf/strings/translate.hpp - - test -f $PREFIX/include/cudf/strings/wrap.hpp - - test -f $PREFIX/include/cudf/structs/detail/concatenate.hpp - - test -f $PREFIX/include/cudf/structs/struct_view.hpp - - test -f $PREFIX/include/cudf/structs/structs_column_view.hpp - - test -f $PREFIX/include/cudf/table/table.hpp - - test -f $PREFIX/include/cudf/table/table_view.hpp - - test -f $PREFIX/include/cudf/tdigest/tdigest_column_view.hpp - - test -f $PREFIX/include/cudf/transform.hpp - - test -f $PREFIX/include/cudf/transpose.hpp - - test -f $PREFIX/include/cudf/types.hpp - - test -f $PREFIX/include/cudf/unary.hpp - - test -f $PREFIX/include/cudf/utilities/bit.hpp - - test -f $PREFIX/include/cudf/utilities/default_stream.hpp - - test -f $PREFIX/include/cudf/utilities/error.hpp - - test -f $PREFIX/include/cudf/utilities/logger.hpp - - test -f $PREFIX/include/cudf/utilities/span.hpp - - test -f $PREFIX/include/cudf/utilities/traits.hpp - - test -f $PREFIX/include/cudf/utilities/type_checks.hpp - - test -f $PREFIX/include/cudf/utilities/type_dispatcher.hpp - - test -f $PREFIX/include/cudf/wrappers/dictionary.hpp - - test -f $PREFIX/include/cudf/wrappers/durations.hpp - - test -f $PREFIX/include/cudf/wrappers/timestamps.hpp - - test -f $PREFIX/include/cudf_test/base_fixture.hpp - - test -f $PREFIX/include/cudf_test/column_utilities.hpp - - test -f $PREFIX/include/cudf_test/column_wrapper.hpp - - test -f $PREFIX/include/cudf_test/cudf_gtest.hpp - - test -f $PREFIX/include/cudf_test/cxxopts.hpp - - test -f $PREFIX/include/cudf_test/default_stream.hpp - - test -f $PREFIX/include/cudf_test/detail/column_utilities.hpp - - test -f $PREFIX/include/cudf_test/file_utilities.hpp - - test -f $PREFIX/include/cudf_test/io_metadata_utilities.hpp - - test -f $PREFIX/include/cudf_test/iterator_utilities.hpp - - test -f $PREFIX/include/cudf_test/stream_checking_resource_adaptor.hpp - - test -f $PREFIX/include/cudf_test/table_utilities.hpp - - test -f $PREFIX/include/cudf_test/timestamp_utilities.cuh 
- - test -f $PREFIX/include/cudf_test/type_list_utilities.hpp - - test -f $PREFIX/include/cudf_test/type_lists.hpp about: home: https://rapids.ai/ license: Apache-2.0 From 886e189e4c3cbad258563f4ec5b0f41fc6e15b5e Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Fri, 8 Sep 2023 18:27:18 -0700 Subject: [PATCH 161/230] Remove the mr from the base fixture (#14057) This mr is just an alias for the current memory resource, so we don't really need it. This came up in https://github.com/rapidsai/cudf/pull/14010#discussion_r1312405952. This PR removes all uses of it, but does not actually remove the mr yet. That will be done in a follow-up (see https://github.com/rapidsai/cudf/pull/14057#issuecomment-1712340714). Authors: - Vyas Ramasubramani (https://github.com/vyasr) Approvers: - David Wendt (https://github.com/davidwendt) - Nghia Truong (https://github.com/ttnghia) URL: https://github.com/rapidsai/cudf/pull/14057 --- cpp/tests/column/factories_test.cpp | 137 +++++++------------------- cpp/tests/copying/split_tests.cpp | 17 ++-- cpp/tests/scalar/factories_test.cpp | 37 +++---- cpp/tests/wrappers/timestamps_test.cu | 13 +-- 4 files changed, 64 insertions(+), 140 deletions(-) diff --git a/cpp/tests/column/factories_test.cpp b/cpp/tests/column/factories_test.cpp index 95706ad9e37..b06d097647d 100644 --- a/cpp/tests/column/factories_test.cpp +++ b/cpp/tests/column/factories_test.cpp @@ -37,7 +37,6 @@ class ColumnFactoryTest : public cudf::test::BaseFixture { public: cudf::size_type size() { return _size; } - rmm::cuda_stream_view stream() { return cudf::get_default_stream(); } }; template @@ -47,11 +46,8 @@ TYPED_TEST_SUITE(NumericFactoryTest, cudf::test::NumericTypes); TYPED_TEST(NumericFactoryTest, EmptyNoMask) { - auto column = cudf::make_numeric_column(cudf::data_type{cudf::type_to_id()}, - 0, - cudf::mask_state::UNALLOCATED, - this->stream(), - this->mr()); + auto column = cudf::make_numeric_column( + cudf::data_type{cudf::type_to_id()}, 0, cudf::mask_state::UNALLOCATED); EXPECT_EQ(column->type(), cudf::data_type{cudf::type_to_id()}); EXPECT_EQ(column->size(), 0); EXPECT_EQ(0, column->null_count()); @@ -62,11 +58,8 @@ TYPED_TEST(NumericFactoryTest, EmptyNoMask) TYPED_TEST(NumericFactoryTest, EmptyAllValidMask) { - auto column = cudf::make_numeric_column(cudf::data_type{cudf::type_to_id()}, - 0, - cudf::mask_state::ALL_VALID, - this->stream(), - this->mr()); + auto column = cudf::make_numeric_column( + cudf::data_type{cudf::type_to_id()}, 0, cudf::mask_state::ALL_VALID); EXPECT_EQ(column->type(), cudf::data_type{cudf::type_to_id()}); EXPECT_EQ(column->size(), 0); EXPECT_EQ(0, column->null_count()); @@ -77,11 +70,8 @@ TYPED_TEST(NumericFactoryTest, EmptyAllValidMask) TYPED_TEST(NumericFactoryTest, EmptyAllNullMask) { - auto column = cudf::make_numeric_column(cudf::data_type{cudf::type_to_id()}, - 0, - cudf::mask_state::ALL_NULL, - this->stream(), - this->mr()); + auto column = cudf::make_numeric_column( + cudf::data_type{cudf::type_to_id()}, 0, cudf::mask_state::ALL_NULL); EXPECT_EQ(column->type(), cudf::data_type{cudf::type_to_id()}); EXPECT_EQ(column->size(), 0); EXPECT_EQ(0, column->null_count()); @@ -92,11 +82,8 @@ TYPED_TEST(NumericFactoryTest, EmptyAllNullMask) TYPED_TEST(NumericFactoryTest, NoMask) { - auto column = cudf::make_numeric_column(cudf::data_type{cudf::type_to_id()}, - this->size(), - cudf::mask_state::UNALLOCATED, - this->stream(), - this->mr()); + auto column = cudf::make_numeric_column( + cudf::data_type{cudf::type_to_id()}, this->size(), cudf::mask_state::UNALLOCATED); 
EXPECT_EQ(column->type(), cudf::data_type{cudf::type_to_id()}); EXPECT_EQ(column->size(), this->size()); EXPECT_EQ(0, column->null_count()); @@ -107,11 +94,8 @@ TYPED_TEST(NumericFactoryTest, NoMask) TYPED_TEST(NumericFactoryTest, UnitializedMask) { - auto column = cudf::make_numeric_column(cudf::data_type{cudf::type_to_id()}, - this->size(), - cudf::mask_state::UNINITIALIZED, - this->stream(), - this->mr()); + auto column = cudf::make_numeric_column( + cudf::data_type{cudf::type_to_id()}, this->size(), cudf::mask_state::UNINITIALIZED); EXPECT_EQ(column->type(), cudf::data_type{cudf::type_to_id()}); EXPECT_EQ(column->size(), this->size()); EXPECT_TRUE(column->nullable()); @@ -120,11 +104,8 @@ TYPED_TEST(NumericFactoryTest, UnitializedMask) TYPED_TEST(NumericFactoryTest, AllValidMask) { - auto column = cudf::make_numeric_column(cudf::data_type{cudf::type_to_id()}, - this->size(), - cudf::mask_state::ALL_VALID, - this->stream(), - this->mr()); + auto column = cudf::make_numeric_column( + cudf::data_type{cudf::type_to_id()}, this->size(), cudf::mask_state::ALL_VALID); EXPECT_EQ(column->type(), cudf::data_type{cudf::type_to_id()}); EXPECT_EQ(column->size(), this->size()); EXPECT_EQ(0, column->null_count()); @@ -135,11 +116,8 @@ TYPED_TEST(NumericFactoryTest, AllValidMask) TYPED_TEST(NumericFactoryTest, AllNullMask) { - auto column = cudf::make_numeric_column(cudf::data_type{cudf::type_to_id()}, - this->size(), - cudf::mask_state::ALL_NULL, - this->stream(), - this->mr()); + auto column = cudf::make_numeric_column( + cudf::data_type{cudf::type_to_id()}, this->size(), cudf::mask_state::ALL_NULL); EXPECT_EQ(column->type(), cudf::data_type{cudf::type_to_id()}); EXPECT_EQ(column->size(), this->size()); EXPECT_EQ(this->size(), column->null_count()); @@ -154,9 +132,7 @@ TYPED_TEST(NumericFactoryTest, NullMaskAsParm) auto column = cudf::make_numeric_column(cudf::data_type{cudf::type_to_id()}, this->size(), std::move(null_mask), - this->size(), - this->stream(), - this->mr()); + this->size()); EXPECT_EQ(column->type(), cudf::data_type{cudf::type_to_id()}); EXPECT_EQ(column->size(), this->size()); EXPECT_EQ(this->size(), column->null_count()); @@ -167,12 +143,8 @@ TYPED_TEST(NumericFactoryTest, NullMaskAsParm) TYPED_TEST(NumericFactoryTest, NullMaskAsEmptyParm) { - auto column = cudf::make_numeric_column(cudf::data_type{cudf::type_to_id()}, - this->size(), - rmm::device_buffer{}, - 0, - this->stream(), - this->mr()); + auto column = cudf::make_numeric_column( + cudf::data_type{cudf::type_to_id()}, this->size(), rmm::device_buffer{}, 0); EXPECT_EQ(column->type(), cudf::data_type{cudf::type_to_id()}); EXPECT_EQ(column->size(), this->size()); EXPECT_EQ(0, column->null_count()); @@ -188,11 +160,8 @@ class NonNumericFactoryTest : public ColumnFactoryTest, TEST_P(NonNumericFactoryTest, NonNumericThrow) { auto construct = [this]() { - auto column = cudf::make_numeric_column(cudf::data_type{GetParam()}, - this->size(), - cudf::mask_state::UNALLOCATED, - this->stream(), - this->mr()); + auto column = cudf::make_numeric_column( + cudf::data_type{GetParam()}, this->size(), cudf::mask_state::UNALLOCATED); }; EXPECT_THROW(construct(), cudf::logic_error); } @@ -208,11 +177,8 @@ TYPED_TEST_SUITE(FixedWidthFactoryTest, cudf::test::FixedWidthTypes); TYPED_TEST(FixedWidthFactoryTest, EmptyNoMask) { - auto column = cudf::make_fixed_width_column(cudf::data_type{cudf::type_to_id()}, - 0, - cudf::mask_state::UNALLOCATED, - this->stream(), - this->mr()); + auto column = cudf::make_fixed_width_column( + 
cudf::data_type{cudf::type_to_id()}, 0, cudf::mask_state::UNALLOCATED); EXPECT_EQ(column->type(), cudf::data_type{cudf::type_to_id()}); } @@ -235,11 +201,8 @@ TYPED_TEST(EmptyFactoryTest, Empty) TYPED_TEST(FixedWidthFactoryTest, EmptyAllValidMask) { - auto column = cudf::make_fixed_width_column(cudf::data_type{cudf::type_to_id()}, - 0, - cudf::mask_state::ALL_VALID, - this->stream(), - this->mr()); + auto column = cudf::make_fixed_width_column( + cudf::data_type{cudf::type_to_id()}, 0, cudf::mask_state::ALL_VALID); EXPECT_EQ(column->type(), cudf::data_type{cudf::type_to_id()}); EXPECT_EQ(column->size(), 0); EXPECT_EQ(0, column->null_count()); @@ -250,11 +213,8 @@ TYPED_TEST(FixedWidthFactoryTest, EmptyAllValidMask) TYPED_TEST(FixedWidthFactoryTest, EmptyAllNullMask) { - auto column = cudf::make_fixed_width_column(cudf::data_type{cudf::type_to_id()}, - 0, - cudf::mask_state::ALL_NULL, - this->stream(), - this->mr()); + auto column = cudf::make_fixed_width_column( + cudf::data_type{cudf::type_to_id()}, 0, cudf::mask_state::ALL_NULL); EXPECT_EQ(column->type(), cudf::data_type{cudf::type_to_id()}); EXPECT_EQ(column->size(), 0); EXPECT_EQ(0, column->null_count()); @@ -265,11 +225,8 @@ TYPED_TEST(FixedWidthFactoryTest, EmptyAllNullMask) TYPED_TEST(FixedWidthFactoryTest, NoMask) { - auto column = cudf::make_fixed_width_column(cudf::data_type{cudf::type_to_id()}, - this->size(), - cudf::mask_state::UNALLOCATED, - this->stream(), - this->mr()); + auto column = cudf::make_fixed_width_column( + cudf::data_type{cudf::type_to_id()}, this->size(), cudf::mask_state::UNALLOCATED); EXPECT_EQ(column->type(), cudf::data_type{cudf::type_to_id()}); EXPECT_EQ(column->size(), this->size()); EXPECT_EQ(0, column->null_count()); @@ -280,11 +237,8 @@ TYPED_TEST(FixedWidthFactoryTest, NoMask) TYPED_TEST(FixedWidthFactoryTest, UnitializedMask) { - auto column = cudf::make_fixed_width_column(cudf::data_type{cudf::type_to_id()}, - this->size(), - cudf::mask_state::UNINITIALIZED, - this->stream(), - this->mr()); + auto column = cudf::make_fixed_width_column( + cudf::data_type{cudf::type_to_id()}, this->size(), cudf::mask_state::UNINITIALIZED); EXPECT_EQ(column->type(), cudf::data_type{cudf::type_to_id()}); EXPECT_EQ(column->size(), this->size()); EXPECT_TRUE(column->nullable()); @@ -293,11 +247,8 @@ TYPED_TEST(FixedWidthFactoryTest, UnitializedMask) TYPED_TEST(FixedWidthFactoryTest, AllValidMask) { - auto column = cudf::make_fixed_width_column(cudf::data_type{cudf::type_to_id()}, - this->size(), - cudf::mask_state::ALL_VALID, - this->stream(), - this->mr()); + auto column = cudf::make_fixed_width_column( + cudf::data_type{cudf::type_to_id()}, this->size(), cudf::mask_state::ALL_VALID); EXPECT_EQ(column->type(), cudf::data_type{cudf::type_to_id()}); EXPECT_EQ(column->size(), this->size()); EXPECT_EQ(0, column->null_count()); @@ -308,11 +259,8 @@ TYPED_TEST(FixedWidthFactoryTest, AllValidMask) TYPED_TEST(FixedWidthFactoryTest, AllNullMask) { - auto column = cudf::make_fixed_width_column(cudf::data_type{cudf::type_to_id()}, - this->size(), - cudf::mask_state::ALL_NULL, - this->stream(), - this->mr()); + auto column = cudf::make_fixed_width_column( + cudf::data_type{cudf::type_to_id()}, this->size(), cudf::mask_state::ALL_NULL); EXPECT_EQ(column->type(), cudf::data_type{cudf::type_to_id()}); EXPECT_EQ(column->size(), this->size()); EXPECT_EQ(this->size(), column->null_count()); @@ -327,9 +275,7 @@ TYPED_TEST(FixedWidthFactoryTest, NullMaskAsParm) auto column = 
cudf::make_fixed_width_column(cudf::data_type{cudf::type_to_id()}, this->size(), std::move(null_mask), - this->size(), - this->stream(), - this->mr()); + this->size()); EXPECT_EQ(column->type(), cudf::data_type{cudf::type_to_id()}); EXPECT_EQ(column->size(), this->size()); EXPECT_EQ(this->size(), column->null_count()); @@ -340,12 +286,8 @@ TYPED_TEST(FixedWidthFactoryTest, NullMaskAsParm) TYPED_TEST(FixedWidthFactoryTest, NullMaskAsEmptyParm) { - auto column = cudf::make_fixed_width_column(cudf::data_type{cudf::type_to_id()}, - this->size(), - rmm::device_buffer{}, - 0, - this->stream(), - this->mr()); + auto column = cudf::make_fixed_width_column( + cudf::data_type{cudf::type_to_id()}, this->size(), rmm::device_buffer{}, 0); EXPECT_EQ(column->type(), cudf::data_type{cudf::type_to_id()}); EXPECT_EQ(column->size(), this->size()); EXPECT_EQ(0, column->null_count()); @@ -361,11 +303,8 @@ class NonFixedWidthFactoryTest : public ColumnFactoryTest, TEST_P(NonFixedWidthFactoryTest, NonFixedWidthThrow) { auto construct = [this]() { - auto column = cudf::make_fixed_width_column(cudf::data_type{GetParam()}, - this->size(), - cudf::mask_state::UNALLOCATED, - this->stream(), - this->mr()); + auto column = cudf::make_fixed_width_column( + cudf::data_type{GetParam()}, this->size(), cudf::mask_state::UNALLOCATED); }; EXPECT_THROW(construct(), cudf::logic_error); } diff --git a/cpp/tests/copying/split_tests.cpp b/cpp/tests/copying/split_tests.cpp index 7a5c738dc12..842ba801df0 100644 --- a/cpp/tests/copying/split_tests.cpp +++ b/cpp/tests/copying/split_tests.cpp @@ -2304,13 +2304,14 @@ TEST_F(ContiguousSplitTableCornerCases, OutBufferToSmall) { // internally, contiguous split chunks GPU work in 1MB contiguous copies // so the output buffer must be 1MB or larger. - EXPECT_THROW(cudf::chunked_pack::create({}, 1 * 1024, mr()), cudf::logic_error); + EXPECT_THROW(cudf::chunked_pack::create({}, 1 * 1024), cudf::logic_error); } TEST_F(ContiguousSplitTableCornerCases, ChunkSpanTooSmall) { - auto chunked_pack = cudf::chunked_pack::create({}, 1 * 1024 * 1024, mr()); - rmm::device_buffer buff(1 * 1024, cudf::get_default_stream(), mr()); + auto chunked_pack = cudf::chunked_pack::create({}, 1 * 1024 * 1024); + rmm::device_buffer buff( + 1 * 1024, cudf::test::get_default_stream(), rmm::mr::get_current_device_resource()); cudf::device_span too_small(static_cast(buff.data()), buff.size()); std::size_t copied = 0; // throws because we created chunked_contig_split with 1MB, but we are giving @@ -2321,8 +2322,9 @@ TEST_F(ContiguousSplitTableCornerCases, ChunkSpanTooSmall) TEST_F(ContiguousSplitTableCornerCases, EmptyTableHasNextFalse) { - auto chunked_pack = cudf::chunked_pack::create({}, 1 * 1024 * 1024, mr()); - rmm::device_buffer buff(1 * 1024 * 1024, cudf::get_default_stream(), mr()); + auto chunked_pack = cudf::chunked_pack::create({}, 1 * 1024 * 1024); + rmm::device_buffer buff( + 1 * 1024 * 1024, cudf::test::get_default_stream(), rmm::mr::get_current_device_resource()); cudf::device_span bounce_buff(static_cast(buff.data()), buff.size()); EXPECT_EQ(chunked_pack->has_next(), false); // empty input table std::size_t copied = 0; @@ -2334,9 +2336,10 @@ TEST_F(ContiguousSplitTableCornerCases, ExhaustedHasNextFalse) { cudf::test::strings_column_wrapper a{"abc", "def", "ghi", "jkl", "mno", "", "st", "uvwx"}; cudf::table_view t({a}); - rmm::device_buffer buff(1 * 1024 * 1024, cudf::get_default_stream(), mr()); + rmm::device_buffer buff( + 1 * 1024 * 1024, cudf::test::get_default_stream(), 
rmm::mr::get_current_device_resource()); cudf::device_span bounce_buff(static_cast(buff.data()), buff.size()); - auto chunked_pack = cudf::chunked_pack::create(t, buff.size(), mr()); + auto chunked_pack = cudf::chunked_pack::create(t, buff.size()); EXPECT_EQ(chunked_pack->has_next(), true); std::size_t copied = chunked_pack->next(bounce_buff); EXPECT_EQ(copied, chunked_pack->get_total_contiguous_size()); diff --git a/cpp/tests/scalar/factories_test.cpp b/cpp/tests/scalar/factories_test.cpp index febae11832d..7da5c408a48 100644 --- a/cpp/tests/scalar/factories_test.cpp +++ b/cpp/tests/scalar/factories_test.cpp @@ -26,22 +26,17 @@ #include -class ScalarFactoryTest : public cudf::test::BaseFixture { - public: - rmm::cuda_stream_view stream() { return cudf::get_default_stream(); } -}; +class ScalarFactoryTest : public cudf::test::BaseFixture {}; template -struct NumericScalarFactory : public ScalarFactoryTest { - static constexpr auto factory = cudf::make_numeric_scalar; -}; +struct NumericScalarFactory : public ScalarFactoryTest {}; TYPED_TEST_SUITE(NumericScalarFactory, cudf::test::NumericTypes); TYPED_TEST(NumericScalarFactory, FactoryDefault) { std::unique_ptr s = - this->factory(cudf::data_type{cudf::type_to_id()}, this->stream(), this->mr()); + cudf::make_numeric_scalar(cudf::data_type{cudf::type_to_id()}); EXPECT_EQ(s->type(), cudf::data_type{cudf::type_to_id()}); EXPECT_FALSE(s->is_valid()); @@ -50,7 +45,7 @@ TYPED_TEST(NumericScalarFactory, FactoryDefault) TYPED_TEST(NumericScalarFactory, TypeCast) { std::unique_ptr s = - this->factory(cudf::data_type{cudf::type_to_id()}, this->stream(), this->mr()); + cudf::make_numeric_scalar(cudf::data_type{cudf::type_to_id()}); auto numeric_s = static_cast*>(s.get()); @@ -62,16 +57,14 @@ TYPED_TEST(NumericScalarFactory, TypeCast) } template -struct TimestampScalarFactory : public ScalarFactoryTest { - static constexpr auto factory = cudf::make_timestamp_scalar; -}; +struct TimestampScalarFactory : public ScalarFactoryTest {}; TYPED_TEST_SUITE(TimestampScalarFactory, cudf::test::TimestampTypes); TYPED_TEST(TimestampScalarFactory, FactoryDefault) { std::unique_ptr s = - this->factory(cudf::data_type{cudf::type_to_id()}, this->stream(), this->mr()); + cudf::make_timestamp_scalar(cudf::data_type{cudf::type_to_id()}); EXPECT_EQ(s->type(), cudf::data_type{cudf::type_to_id()}); EXPECT_FALSE(s->is_valid()); @@ -80,7 +73,7 @@ TYPED_TEST(TimestampScalarFactory, FactoryDefault) TYPED_TEST(TimestampScalarFactory, TypeCast) { std::unique_ptr s = - this->factory(cudf::data_type{cudf::type_to_id()}, this->stream(), this->mr()); + cudf::make_timestamp_scalar(cudf::data_type{cudf::type_to_id()}); auto numeric_s = static_cast*>(s.get()); @@ -92,9 +85,7 @@ TYPED_TEST(TimestampScalarFactory, TypeCast) } template -struct DefaultScalarFactory : public ScalarFactoryTest { - static constexpr auto factory = cudf::make_default_constructed_scalar; -}; +struct DefaultScalarFactory : public ScalarFactoryTest {}; using MixedTypes = cudf::test::Concat; TYPED_TEST_SUITE(DefaultScalarFactory, MixedTypes); @@ -102,7 +93,7 @@ TYPED_TEST_SUITE(DefaultScalarFactory, MixedTypes); TYPED_TEST(DefaultScalarFactory, FactoryDefault) { std::unique_ptr s = - this->factory(cudf::data_type{cudf::type_to_id()}, this->stream(), this->mr()); + cudf::make_default_constructed_scalar(cudf::data_type{cudf::type_to_id()}); EXPECT_EQ(s->type(), cudf::data_type{cudf::type_to_id()}); EXPECT_FALSE(s->is_valid()); @@ -111,7 +102,7 @@ TYPED_TEST(DefaultScalarFactory, FactoryDefault) 
TYPED_TEST(DefaultScalarFactory, TypeCast) { std::unique_ptr s = - this->factory(cudf::data_type{cudf::type_to_id()}, this->stream(), this->mr()); + cudf::make_default_constructed_scalar(cudf::data_type{cudf::type_to_id()}); auto numeric_s = static_cast*>(s.get()); @@ -129,8 +120,7 @@ TYPED_TEST(FixedWidthScalarFactory, ValueProvided) { TypeParam value = cudf::test::make_type_param_scalar(54); - std::unique_ptr s = - cudf::make_fixed_width_scalar(value, this->stream(), this->mr()); + std::unique_ptr s = cudf::make_fixed_width_scalar(value); auto numeric_s = static_cast*>(s.get()); @@ -150,9 +140,8 @@ TYPED_TEST(FixedPointScalarFactory, ValueProvided) using namespace numeric; using decimalXX = TypeParam; - auto const rep_value = static_cast(123); - auto const s = - cudf::make_fixed_point_scalar(123, scale_type{-2}, this->stream(), this->mr()); + auto const rep_value = static_cast(123); + auto const s = cudf::make_fixed_point_scalar(123, scale_type{-2}); auto const fp_s = static_cast*>(s.get()); auto const expected_dtype = cudf::data_type{cudf::type_to_id(), -2}; diff --git a/cpp/tests/wrappers/timestamps_test.cu b/cpp/tests/wrappers/timestamps_test.cu index e6c65b4e0e4..f7d3df18ffd 100644 --- a/cpp/tests/wrappers/timestamps_test.cu +++ b/cpp/tests/wrappers/timestamps_test.cu @@ -38,7 +38,6 @@ template struct ChronoColumnTest : public cudf::test::BaseFixture { - rmm::cuda_stream_view stream() { return cudf::get_default_stream(); } cudf::size_type size() { return cudf::size_type(100); } cudf::data_type type() { return cudf::data_type{cudf::type_to_id()}; } }; @@ -188,9 +187,7 @@ TYPED_TEST(ChronoColumnTest, ChronoFactoryNullMaskAsParm) auto column = make_fixed_width_column(cudf::data_type{cudf::type_to_id()}, this->size(), std::move(null_mask), - this->size(), - this->stream(), - this->mr()); + this->size()); EXPECT_EQ(column->type(), cudf::data_type{cudf::type_to_id()}); EXPECT_EQ(column->size(), this->size()); EXPECT_EQ(this->size(), column->null_count()); @@ -202,12 +199,8 @@ TYPED_TEST(ChronoColumnTest, ChronoFactoryNullMaskAsParm) TYPED_TEST(ChronoColumnTest, ChronoFactoryNullMaskAsEmptyParm) { rmm::device_buffer null_mask{}; - auto column = make_fixed_width_column(cudf::data_type{cudf::type_to_id()}, - this->size(), - std::move(null_mask), - 0, - this->stream(), - this->mr()); + auto column = make_fixed_width_column( + cudf::data_type{cudf::type_to_id()}, this->size(), std::move(null_mask), 0); EXPECT_EQ(column->type(), cudf::data_type{cudf::type_to_id()}); EXPECT_EQ(column->size(), this->size()); From 0bcad6cfeb93a895285bcaf19ca694d2d8229347 Mon Sep 17 00:00:00 2001 From: Vukasin Milovanovic Date: Mon, 11 Sep 2023 12:01:58 -0700 Subject: [PATCH 162/230] Remove debug print in a Parquet test (#14063) Removed a debug print. That's it. 
Authors: - Vukasin Milovanovic (https://github.com/vuule) Approvers: - Karthikeyan (https://github.com/karthikeyann) - David Wendt (https://github.com/davidwendt) - Nghia Truong (https://github.com/ttnghia) URL: https://github.com/rapidsai/cudf/pull/14063 --- cpp/tests/io/parquet_test.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/cpp/tests/io/parquet_test.cpp b/cpp/tests/io/parquet_test.cpp index 3cd5c9f5593..64aca091686 100644 --- a/cpp/tests/io/parquet_test.cpp +++ b/cpp/tests/io/parquet_test.cpp @@ -6534,7 +6534,6 @@ TEST_F(ParquetReaderTest, FilterFloatNAN) auto col0 = cudf::test::fixed_width_column_wrapper(elements, elements + num_rows); auto col1 = cudf::test::fixed_width_column_wrapper(elements, elements + num_rows); - cudf::test::print(col0); auto const written_table = table_view{{col0, col1}}; auto const filepath = temp_env->get_temp_filepath("FilterFloatNAN.parquet"); { From bc304a29d244ad502fbdc6a304c5de0e99aeb57c Mon Sep 17 00:00:00 2001 From: Christopher Harris Date: Mon, 11 Sep 2023 14:28:37 -0500 Subject: [PATCH 163/230] Produce a fatal error if cudf is unable to find pyarrow include directory (#13976) Produce a fatal error if cudf python is unable to find pyarrow include directory. Previously the failure only presented itself while trying to compile cython files which failed to include headers from pyarrow. _Previously:_ ``` FAILED: cudf/_lib/CMakeFiles/avro.dir/avro.cxx.o /usr/bin/sccache /usr/bin/g++ -DFMT_HEADER_ONLY=1 -DSPDLOG_ACTIVE_LEVEL=SPDLOG_LEVEL_INFO -DSPDLOG_FMT_EXTERNAL -DTHRUST_DEVICE_SYSTEM=THRUST_DEVICE_SYSTEM_CUDA -DTHRUST_HOST_SYSTEM=THRUST_HOST_SYSTEM_CPP -Davro_EXPORTS -I/usr/include/python3.10 -I/home/coder/.local/share/venvs/rapids/lib/python3.10/site-packages/numpy/core/include -I/home/coder/cudf/python/cudf/cudf/_lib -I/home/coder/cudf/cpp/build/release/_deps/libcudacxx-src/lib/cmake/libcudacxx/../../../include -I/home/coder/cudf/cpp/build/release/_deps/thrust-src -I/home/coder/cudf/cpp/build/release/_deps/thrust-src/dependencies/cub -isystem /home/coder/cudf/cpp/build/release/_deps/dlpack-src/include -isystem /home/coder/cudf/cpp/build/release/_deps/jitify-src -isystem /home/coder/cudf/cpp/include -isystem /home/coder/cudf/cpp/build/release/include -isystem /home/coder/rmm/include -isystem /usr/local/cuda/include -isystem /home/coder/fmt/include -isystem /home/coder/rmm/build/release/_deps/spdlog-src/include -O3 -DNDEBUG -fPIC -MD -MT cudf/_lib/CMakeFiles/avro.dir/avro.cxx.o -MF cudf/_lib/CMakeFiles/avro.dir/avro.cxx.o.d -o cudf/_lib/CMakeFiles/avro.dir/avro.cxx.o -c /home/coder/cudf/python/cudf/_skbuild/linux-x86_64-3.10/cmake-build/cudf/_lib/avro.cxx /home/coder/cudf/python/cudf/_skbuild/linux-x86_64-3.10/cmake-build/cudf/_lib/avro.cxx:1291:10: fatal error: arrow/python/platform.h: No such file or directory 1291 | #include "arrow/python/platform.h" | ^~~~~~~~~~~~~~~~~~~~~~~~~ compilation terminated. 
[6/23] Building CXX object cudf/_lib/CMakeFiles/csv.dir/csv.cxx.o FAILED: cudf/_lib/CMakeFiles/csv.dir/csv.cxx.o /usr/bin/sccache /usr/bin/g++ -DFMT_HEADER_ONLY=1 -DSPDLOG_ACTIVE_LEVEL=SPDLOG_LEVEL_INFO -DSPDLOG_FMT_EXTERNAL -DTHRUST_DEVICE_SYSTEM=THRUST_DEVICE_SYSTEM_CUDA -DTHRUST_HOST_SYSTEM=THRUST_HOST_SYSTEM_CPP -Dcsv_EXPORTS -I/usr/include/python3.10 -I/home/coder/.local/share/venvs/rapids/lib/python3.10/site-packages/numpy/core/include -I/home/coder/cudf/python/cudf/cudf/_lib -I/home/coder/cudf/cpp/build/release/_deps/libcudacxx-src/lib/cmake/libcudacxx/../../../include -I/home/coder/cudf/cpp/build/release/_deps/thrust-src -I/home/coder/cudf/cpp/build/release/_deps/thrust-src/dependencies/cub -isystem /home/coder/cudf/cpp/build/release/_deps/dlpack-src/include -isystem /home/coder/cudf/cpp/build/release/_deps/jitify-src -isystem /home/coder/cudf/cpp/include -isystem /home/coder/cudf/cpp/build/release/include -isystem /home/coder/rmm/include -isystem /usr/local/cuda/include -isystem /home/coder/fmt/include -isystem /home/coder/rmm/build/release/_deps/spdlog-src/include -O3 -DNDEBUG -fPIC -MD -MT cudf/_lib/CMakeFiles/csv.dir/csv.cxx.o -MF cudf/_lib/CMakeFiles/csv.dir/csv.cxx.o.d -o cudf/_lib/CMakeFiles/csv.dir/csv.cxx.o -c /home/coder/cudf/python/cudf/_skbuild/linux-x86_64-3.10/cmake-build/cudf/_lib/csv.cxx /home/coder/cudf/python/cudf/_skbuild/linux-x86_64-3.10/cmake-build/cudf/_lib/csv.cxx:1292:10: fatal error: arrow/python/platform.h: No such file or directory 1292 | #include "arrow/python/platform.h" | ^~~~~~~~~~~~~~~~~~~~~~~~~ compilation terminated. ... ``` _With these changes:_ ``` CMake Error at cudf/_lib/CMakeLists.txt:107 (message): Error while trying to obtain pyarrow include dir: Traceback (most recent call last): File "", line 1, in File "/home/coder/.local/share/venvs/rapids/lib/python3.10/site-packages/pyarrow/__init__.py", line 65, in import pyarrow.lib as _lib ImportError: libarrow.so.1200: cannot open shared object file: No such file or directory ``` Authors: - Christopher Harris (https://github.com/cwharris) - Vyas Ramasubramani (https://github.com/vyasr) Approvers: - Vyas Ramasubramani (https://github.com/vyasr) URL: https://github.com/rapidsai/cudf/pull/13976 --- python/cudf/cudf/_lib/CMakeLists.txt | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/python/cudf/cudf/_lib/CMakeLists.txt b/python/cudf/cudf/_lib/CMakeLists.txt index 06de6cc825f..947659c290a 100644 --- a/python/cudf/cudf/_lib/CMakeLists.txt +++ b/python/cudf/cudf/_lib/CMakeLists.txt @@ -98,9 +98,15 @@ find_package(Python 3.9 REQUIRED COMPONENTS Interpreter) execute_process( COMMAND "${Python_EXECUTABLE}" -c "import pyarrow; print(pyarrow.get_include())" OUTPUT_VARIABLE PYARROW_INCLUDE_DIR + ERROR_VARIABLE PYARROW_ERROR + RESULT_VARIABLE PYARROW_RESULT OUTPUT_STRIP_TRAILING_WHITESPACE ) +if(${PYARROW_RESULT}) + message(FATAL_ERROR "Error while trying to obtain pyarrow include directory:\n${PYARROW_ERROR}") +endif() + set(targets_using_arrow_headers interop avro csv orc json parquet) foreach(target IN LISTS targets_using_arrow_headers) target_include_directories(${target} PRIVATE "${PYARROW_INCLUDE_DIR}") From c3bf70595210d684fd747a927e59abc739aea8cf Mon Sep 17 00:00:00 2001 From: GALI PREM SAGAR Date: Mon, 11 Sep 2023 20:19:17 -0500 Subject: [PATCH 164/230] Fix renaming `Series` and `Index` (#14080) This PR resolves renaming `Series` and `Index` by assigning `no_default` to internal API default parameters. 
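To make the change concrete: `None`, `NaN`, `NaT`, and similar values are all legal `Series`/`Index` names, so a plain `name=None` default cannot distinguish "no name supplied" from "rename to `None`". A shared sentinel default resolves that ambiguity. The snippet below is only an illustrative sketch of the pattern with a stand-in sentinel, not the actual cudf internals:

```
class _NoDefault:
    """Stand-in for the shared ``no_default`` sentinel."""

no_default = _NoDefault()

def resolve_name(current_name, name=no_default):
    # Only the sentinel means "keep the existing name"; an explicit
    # None (or NaN/NaT/...) is treated as a real rename target.
    return current_name if name is no_default else name

assert resolve_name("a") == "a"         # name left untouched
assert resolve_name("a", None) is None  # explicit None honored
```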
Authors: - GALI PREM SAGAR (https://github.com/galipremsagar) Approvers: - Bradley Dice (https://github.com/bdice) - Matthew Roeschke (https://github.com/mroeschke) URL: https://github.com/rapidsai/cudf/pull/14080 --- python/cudf/cudf/core/index.py | 10 +++++---- python/cudf/cudf/core/series.py | 4 ++-- python/cudf/cudf/testing/_utils.py | 27 ++++++++++++++++++++++++ python/cudf/cudf/tests/test_binops.py | 30 ++------------------------- python/cudf/cudf/tests/test_index.py | 13 ++++++++---- python/cudf/cudf/tests/test_series.py | 15 ++++++++++++++ 6 files changed, 61 insertions(+), 38 deletions(-) diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py index 4bb5428838f..57c481db0d8 100644 --- a/python/cudf/cudf/core/index.py +++ b/python/cudf/cudf/core/index.py @@ -28,6 +28,7 @@ from cudf._lib.filling import sequence from cudf._lib.search import search_sorted from cudf._lib.types import size_type_dtype +from cudf.api.extensions import no_default from cudf.api.types import ( _is_non_decimal_numeric_dtype, is_categorical_dtype, @@ -95,7 +96,7 @@ def _lexsorted_equal_range( return lower_bound, upper_bound, sort_inds -def _index_from_data(data: MutableMapping, name: Any = None): +def _index_from_data(data: MutableMapping, name: Any = no_default): """Construct an index of the appropriate type from some data.""" if len(data) == 0: @@ -131,7 +132,7 @@ def _index_from_data(data: MutableMapping, name: Any = None): def _index_from_columns( - columns: List[cudf.core.column.ColumnBase], name: Any = None + columns: List[cudf.core.column.ColumnBase], name: Any = no_default ): """Construct an index from ``columns``, with levels named 0, 1, 2...""" return _index_from_data(dict(zip(range(len(columns)), columns)), name=name) @@ -1032,10 +1033,10 @@ def __array_ufunc__(self, ufunc, method, *inputs, **kwargs): @classmethod @_cudf_nvtx_annotate def _from_data( - cls, data: MutableMapping, name: Any = None + cls, data: MutableMapping, name: Any = no_default ) -> GenericIndex: out = super()._from_data(data=data) - if name is not None: + if name is not no_default: out.name = name return out @@ -3334,6 +3335,7 @@ def as_index(arbitrary, nan_as_null=None, **kwargs) -> BaseIndex: - DatetimeIndex for Datetime input. - GenericIndex for all other inputs. 
""" + kwargs = _setdefault_name(arbitrary, **kwargs) if isinstance(arbitrary, cudf.MultiIndex): return arbitrary diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index 78be3085754..f44a3123dd3 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -605,10 +605,10 @@ def _from_data( cls, data: MutableMapping, index: Optional[BaseIndex] = None, - name: Any = None, + name: Any = no_default, ) -> Series: out = super()._from_data(data=data, index=index) - if name is not None: + if name is not no_default: out.name = name return out diff --git a/python/cudf/cudf/testing/_utils.py b/python/cudf/cudf/testing/_utils.py index 0489329d801..e949f7d78e7 100644 --- a/python/cudf/cudf/testing/_utils.py +++ b/python/cudf/cudf/testing/_utils.py @@ -48,6 +48,33 @@ OTHER_TYPES = sorted(list(dtypeutils.OTHER_TYPES)) ALL_TYPES = sorted(list(dtypeutils.ALL_TYPES)) +SERIES_OR_INDEX_NAMES = [ + None, + pd.NA, + cudf.NA, + np.nan, + float("NaN"), + "abc", + 1, + pd.NaT, + np.datetime64("nat"), + np.timedelta64("NaT"), + np.timedelta64(10, "D"), + np.timedelta64(5, "D"), + np.datetime64("1970-01-01 00:00:00.000000001"), + np.datetime64("1970-01-01 00:00:00.000000002"), + pd.Timestamp(1), + pd.Timestamp(2), + pd.Timedelta(1), + pd.Timedelta(2), + Decimal("NaN"), + Decimal("1.2"), + np.int64(1), + np.int32(1), + np.float32(1), + pd.Timestamp(1), +] + def set_random_null_mask_inplace(series, null_probability=0.5, seed=None): """Randomly nullify elements in series with the provided probability.""" diff --git a/python/cudf/cudf/tests/test_binops.py b/python/cudf/cudf/tests/test_binops.py index 549cd8da78e..87d510927ae 100644 --- a/python/cudf/cudf/tests/test_binops.py +++ b/python/cudf/cudf/tests/test_binops.py @@ -150,32 +150,6 @@ lambda x: cudf.Scalar(0) / x, ] -_series_or_index_names = [ - None, - pd.NA, - cudf.NA, - np.nan, - float("NaN"), - "abc", - 1, - pd.NaT, - np.datetime64("nat"), - np.timedelta64("NaT"), - np.timedelta64(10, "D"), - np.timedelta64(5, "D"), - np.datetime64("1970-01-01 00:00:00.000000001"), - np.datetime64("1970-01-01 00:00:00.000000002"), - pd.Timestamp(1), - pd.Timestamp(2), - pd.Timedelta(1), - pd.Timedelta(2), - decimal.Decimal("NaN"), - decimal.Decimal("1.2"), - np.int64(1), - np.int32(1), - np.float32(1), - pd.Timestamp(1), -] pytest_xfail = pytest.mark.xfail pytestmark = pytest.mark.spilling @@ -3315,8 +3289,8 @@ def test_binop_index_series(op): utils.assert_eq(expected, actual) -@pytest.mark.parametrize("name1", _series_or_index_names) -@pytest.mark.parametrize("name2", _series_or_index_names) +@pytest.mark.parametrize("name1", utils.SERIES_OR_INDEX_NAMES) +@pytest.mark.parametrize("name2", utils.SERIES_OR_INDEX_NAMES) def test_binop_index_dt_td_series_with_names(name1, name2): gi = cudf.Index([1, 2, 3], dtype="datetime64[ns]", name=name1) gs = cudf.Series([10, 11, 12], dtype="timedelta64[ns]", name=name2) diff --git a/python/cudf/cudf/tests/test_index.py b/python/cudf/cudf/tests/test_index.py index 58dbc48e31e..f7f6e1f9114 100644 --- a/python/cudf/cudf/tests/test_index.py +++ b/python/cudf/cudf/tests/test_index.py @@ -24,6 +24,7 @@ FLOAT_TYPES, NUMERIC_TYPES, OTHER_TYPES, + SERIES_OR_INDEX_NAMES, SIGNED_INTEGER_TYPES, SIGNED_TYPES, UNSIGNED_TYPES, @@ -227,12 +228,16 @@ def test_pandas_as_index(): ) -def test_index_rename(): - pds = pd.Index([1, 2, 3], name="asdf") +@pytest.mark.parametrize("initial_name", SERIES_OR_INDEX_NAMES) +@pytest.mark.parametrize("name", SERIES_OR_INDEX_NAMES) +def test_index_rename(initial_name, name): 
+ pds = pd.Index([1, 2, 3], name=initial_name) gds = as_index(pds) - expect = pds.rename("new_name") - got = gds.rename("new_name") + assert_eq(pds, gds) + + expect = pds.rename(name) + got = gds.rename(name) assert_eq(expect, got) """ diff --git a/python/cudf/cudf/tests/test_series.py b/python/cudf/cudf/tests/test_series.py index 783d7d31d7f..8a652caa6e2 100644 --- a/python/cudf/cudf/tests/test_series.py +++ b/python/cudf/cudf/tests/test_series.py @@ -16,6 +16,7 @@ from cudf.core._compat import PANDAS_LT_140 from cudf.testing._utils import ( NUMERIC_TYPES, + SERIES_OR_INDEX_NAMES, TIMEDELTA_TYPES, _create_pandas_series, assert_eq, @@ -2267,3 +2268,17 @@ def test_series_unique_pandas_compatibility(): actual = gs.unique() expected = ps.unique() assert_eq(actual, expected) + + +@pytest.mark.parametrize("initial_name", SERIES_OR_INDEX_NAMES) +@pytest.mark.parametrize("name", SERIES_OR_INDEX_NAMES) +def test_series_rename(initial_name, name): + gsr = cudf.Series([1, 2, 3], name=initial_name) + psr = pd.Series([1, 2, 3], name=initial_name) + + assert_eq(gsr, psr) + + actual = gsr.rename(name) + expected = psr.rename(name) + + assert_eq(actual, expected) From 1911d33231ac9caeaf5310173bf6a47ffca35fe8 Mon Sep 17 00:00:00 2001 From: GALI PREM SAGAR Date: Mon, 11 Sep 2023 22:55:32 -0500 Subject: [PATCH 165/230] Fix various issues in `Index.intersection` (#14054) This PR fixes multiple issues with `Index.intersection`: - [x] Fixes issues with handling empty inputs, closes #14020 - [x] Adds validation for inputs. - [x] Properly handles various types in `intersection` implementation and fix `RangeIndex.intersection` by having a separate implementation for it. Authors: - GALI PREM SAGAR (https://github.com/galipremsagar) Approvers: - Bradley Dice (https://github.com/bdice) URL: https://github.com/rapidsai/cudf/pull/14054 --- python/cudf/cudf/core/_base_index.py | 23 +++++++++--- python/cudf/cudf/core/index.py | 6 ++-- python/cudf/cudf/core/join/_join_helpers.py | 1 + python/cudf/cudf/tests/test_index.py | 40 ++++++++++++++++----- python/cudf/cudf/utils/dtypes.py | 10 ++++++ 5 files changed, 65 insertions(+), 15 deletions(-) diff --git a/python/cudf/cudf/core/_base_index.py b/python/cudf/cudf/core/_base_index.py index 8091f3f7dd2..2f6e864b51c 100644 --- a/python/cudf/cudf/core/_base_index.py +++ b/python/cudf/cudf/core/_base_index.py @@ -608,8 +608,14 @@ def intersection(self, other, sort=False): (1, 'Blue')], ) """ + if not can_convert_to_column(other): + raise TypeError("Input must be Index or array-like") + if not isinstance(other, BaseIndex): - other = cudf.Index(other, name=self.name) + other = cudf.Index( + other, + name=getattr(other, "name", self.name), + ) if sort not in {None, False}: raise ValueError( @@ -617,10 +623,17 @@ def intersection(self, other, sort=False): f"None or False; {sort} was passed." 
) - if self.equals(other): - if self.has_duplicates: - return self.unique()._get_reconciled_name_object(other) - return self._get_reconciled_name_object(other) + if not len(self) or not len(other) or self.equals(other): + common_dtype = cudf.utils.dtypes._dtype_pandas_compatible( + cudf.utils.dtypes.find_common_type([self.dtype, other.dtype]) + ) + + lhs = self.unique() if self.has_duplicates else self + rhs = other + if not len(other): + lhs, rhs = rhs, lhs + + return lhs._get_reconciled_name_object(rhs).astype(common_dtype) res_name = _get_result_name(self.name, other.name) diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py index 57c481db0d8..56ec9ce0359 100644 --- a/python/cudf/cudf/core/index.py +++ b/python/cudf/cudf/core/index.py @@ -682,7 +682,9 @@ def _union(self, other, sort=None): @_cudf_nvtx_annotate def _intersection(self, other, sort=False): if not isinstance(other, RangeIndex): - return super()._intersection(other, sort=sort) + return self._try_reconstruct_range_index( + super()._intersection(other, sort=sort) + ) if not len(self) or not len(other): return RangeIndex(0) @@ -723,7 +725,7 @@ def _intersection(self, other, sort=False): if sort is None: new_index = new_index.sort_values() - return new_index + return self._try_reconstruct_range_index(new_index) @_cudf_nvtx_annotate def difference(self, other, sort=None): diff --git a/python/cudf/cudf/core/join/_join_helpers.py b/python/cudf/cudf/core/join/_join_helpers.py index 7d799fa1573..1071261044f 100644 --- a/python/cudf/cudf/core/join/_join_helpers.py +++ b/python/cudf/cudf/core/join/_join_helpers.py @@ -74,6 +74,7 @@ def _match_join_keys( common_type = ltype.categories.dtype else: common_type = rtype.categories.dtype + common_type = cudf.utils.dtypes._dtype_pandas_compatible(common_type) return lcol.astype(common_type), rcol.astype(common_type) if is_dtype_equal(ltype, rtype): diff --git a/python/cudf/cudf/tests/test_index.py b/python/cudf/cudf/tests/test_index.py index f7f6e1f9114..6fb615c22e0 100644 --- a/python/cudf/cudf/tests/test_index.py +++ b/python/cudf/cudf/tests/test_index.py @@ -11,6 +11,7 @@ import pytest import cudf +from cudf.api.types import is_bool_dtype from cudf.core._compat import PANDAS_GE_133, PANDAS_GE_200 from cudf.core.index import ( CategoricalIndex, @@ -2104,25 +2105,48 @@ def test_union_index(idx1, idx2, sort): (pd.Index([0, 1, 2, 30], name=pd.NA), pd.Index([30, 0, 90, 100])), (pd.Index([0, 1, 2, 30], name="a"), [90, 100]), (pd.Index([0, 1, 2, 30]), pd.Index([0, 10, 1.0, 11])), - (pd.Index(["a", "b", "c", "d", "c"]), pd.Index(["a", "c", "z"])), + ( + pd.Index(["a", "b", "c", "d", "c"]), + pd.Index(["a", "c", "z"], name="abc"), + ), ( pd.Index(["a", "b", "c", "d", "c"]), pd.Index(["a", "b", "c", "d", "c"]), ), (pd.Index([True, False, True, True]), pd.Index([10, 11, 12, 0, 1, 2])), (pd.Index([True, False, True, True]), pd.Index([True, True])), + (pd.RangeIndex(0, 10, name="a"), pd.Index([5, 6, 7], name="b")), + (pd.Index(["a", "b", "c"], dtype="category"), pd.Index(["a", "b"])), + (pd.Index(["a", "b", "c"], dtype="category"), pd.Index([1, 2, 3])), + (pd.Index([0, 1, 2], dtype="category"), pd.RangeIndex(0, 10)), + (pd.Index(["a", "b", "c"], name="abc"), []), + (pd.Index([], name="abc"), pd.RangeIndex(0, 4)), + (pd.Index([1, 2, 3]), pd.Index([1, 2], dtype="category")), + (pd.Index([]), pd.Index([1, 2], dtype="category")), ], ) @pytest.mark.parametrize("sort", [None, False]) -def test_intersection_index(idx1, idx2, sort): +@pytest.mark.parametrize("pandas_compatible", [True, 
False]) +def test_intersection_index(idx1, idx2, sort, pandas_compatible): expected = idx1.intersection(idx2, sort=sort) - idx1 = cudf.from_pandas(idx1) if isinstance(idx1, pd.Index) else idx1 - idx2 = cudf.from_pandas(idx2) if isinstance(idx2, pd.Index) else idx2 - - actual = idx1.intersection(idx2, sort=sort) - - assert_eq(expected, actual, exact=False) + with cudf.option_context("mode.pandas_compatible", pandas_compatible): + idx1 = cudf.from_pandas(idx1) if isinstance(idx1, pd.Index) else idx1 + idx2 = cudf.from_pandas(idx2) if isinstance(idx2, pd.Index) else idx2 + + actual = idx1.intersection(idx2, sort=sort) + + # TODO: Resolve the bool vs ints mixed issue + # once pandas has a direction on this issue + # https://github.com/pandas-dev/pandas/issues/44000 + assert_eq( + expected, + actual, + exact=False + if (is_bool_dtype(idx1.dtype) and not is_bool_dtype(idx2.dtype)) + or (not is_bool_dtype(idx1.dtype) or is_bool_dtype(idx2.dtype)) + else True, + ) @pytest.mark.parametrize( diff --git a/python/cudf/cudf/utils/dtypes.py b/python/cudf/cudf/utils/dtypes.py index e50457b8e7b..1b94db75340 100644 --- a/python/cudf/cudf/utils/dtypes.py +++ b/python/cudf/cudf/utils/dtypes.py @@ -634,6 +634,16 @@ def find_common_type(dtypes): return cudf.dtype(common_dtype) +def _dtype_pandas_compatible(dtype): + """ + A utility function, that returns `str` instead of `object` + dtype when pandas comptibility mode is enabled. + """ + if cudf.get_option("mode.pandas_compatible") and dtype == cudf.dtype("O"): + return "str" + return dtype + + def _can_cast(from_dtype, to_dtype): """ Utility function to determine if we can cast From 72c958380f42dac5bd04492043cfd569fdcd5f0a Mon Sep 17 00:00:00 2001 From: Bradley Dice Date: Tue, 12 Sep 2023 09:39:15 -0500 Subject: [PATCH 166/230] Add fallback matrix for nvcomp. (#14082) Some platforms (such as aarch64 + CUDA 12) don't have a matching matrix entry for nvcomp. This PR adds a fallback matrix entry so it is possible to attempt local development on aarch64 with CUDA 12. Authors: - Bradley Dice (https://github.com/bdice) Approvers: - AJ Schmidt (https://github.com/ajschmidt8) URL: https://github.com/rapidsai/cudf/pull/14082 --- dependencies.yaml | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/dependencies.yaml b/dependencies.yaml index f99b7404854..398ae193fe6 100644 --- a/dependencies.yaml +++ b/dependencies.yaml @@ -241,6 +241,10 @@ dependencies: cuda: "11.8" packages: - *nvcomp + # TODO: Fallback matrix for aarch64 CUDA 12. After migrating to nvcomp 3, + # all CUDA/arch combinations should be supported by existing packages. + - matrix: + packages: build_wheels: common: - output_types: pyproject From 258e0fef942b734af24adf612b7998cb5da523c5 Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Tue, 12 Sep 2023 15:09:22 -0600 Subject: [PATCH 167/230] [Java] Add recoverWithNull to JSONOptions and pass to Table.readJSON (#14078) This PR exposes the recently added `json_reader_options_builder::recovery_mode` option in the JNI layer. 
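For reference, a minimal usage sketch from the Java side, mirroring what the new `testReadJSONFileWithInvalidLines` test exercises (the file path here is illustrative): with `recoverWithNull` enabled, unparsable lines come back as null rows instead of failing the whole read.

```
Schema schema = Schema.builder()
    .column(DType.STRING, "name")
    .column(DType.INT32, "age")
    .build();

// Recover from malformed lines by emitting nulls rather than throwing.
JSONOptions opts = JSONOptions.builder()
    .withLines(true)
    .withRecoverWithNull(true)
    .build();

try (Table table = Table.readJSON(schema, opts, new File("people_with_invalid_lines.json"))) {
  // rows produced from invalid lines hold nulls in every column
}
```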
closes #14073 Authors: - Andy Grove (https://github.com/andygrove) - Nghia Truong (https://github.com/ttnghia) Approvers: - Gera Shegalov (https://github.com/gerashegalov) - Robert (Bobby) Evans (https://github.com/revans2) - Raza Jafri (https://github.com/razajafri) - Nghia Truong (https://github.com/ttnghia) URL: https://github.com/rapidsai/cudf/pull/14078 --- .../main/java/ai/rapids/cudf/JSONOptions.java | 25 +++++++++++++- java/src/main/java/ai/rapids/cudf/Table.java | 12 ++++--- java/src/main/native/src/TableJni.cpp | 18 +++++++--- .../test/java/ai/rapids/cudf/TableTest.java | 34 +++++++++++++++++++ .../resources/people_with_invalid_lines.json | 4 +++ 5 files changed, 83 insertions(+), 10 deletions(-) create mode 100644 java/src/test/resources/people_with_invalid_lines.json diff --git a/java/src/main/java/ai/rapids/cudf/JSONOptions.java b/java/src/main/java/ai/rapids/cudf/JSONOptions.java index 85a9eb7beb3..f98687df5fa 100644 --- a/java/src/main/java/ai/rapids/cudf/JSONOptions.java +++ b/java/src/main/java/ai/rapids/cudf/JSONOptions.java @@ -1,6 +1,6 @@ /* * - * Copyright (c) 2019-2022, NVIDIA CORPORATION. + * Copyright (c) 2019-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -29,11 +29,13 @@ public final class JSONOptions extends ColumnFilterOptions { private final boolean dayFirst; private final boolean lines; + private final boolean recoverWithNull; private JSONOptions(Builder builder) { super(builder); dayFirst = builder.dayFirst; lines = builder.lines; + recoverWithNull = builder.recoverWithNull; } public boolean isDayFirst() { @@ -44,6 +46,11 @@ public boolean isLines() { return lines; } + /** Return the value of the recoverWithNull option */ + public boolean isRecoverWithNull() { + return recoverWithNull; + } + @Override String[] getIncludeColumnNames() { throw new UnsupportedOperationException("JSON reader didn't support column prune"); @@ -57,6 +64,8 @@ public static final class Builder extends ColumnFilterOptions.Builder= 0 && offset < buffer.length; return new TableWithMeta(readAndInferJSON(buffer.getAddress() + offset, len, - opts.isDayFirst(), opts.isLines())); + opts.isDayFirst(), opts.isLines(), opts.isRecoverWithNull())); } /** @@ -1121,7 +1122,8 @@ public static Table readJSON(Schema schema, JSONOptions opts, HostMemoryBuffer b assert offset >= 0 && offset < buffer.length; try (TableWithMeta twm = new TableWithMeta(readJSON(schema.getColumnNames(), schema.getTypeIds(), schema.getTypeScales(), null, - buffer.getAddress() + offset, len, opts.isDayFirst(), opts.isLines()))) { + buffer.getAddress() + offset, len, opts.isDayFirst(), opts.isLines(), + opts.isRecoverWithNull()))) { return gatherJSONColumns(schema, twm); } } diff --git a/java/src/main/native/src/TableJni.cpp b/java/src/main/native/src/TableJni.cpp index b05fc9b7bc4..b208ef8f381 100644 --- a/java/src/main/native/src/TableJni.cpp +++ b/java/src/main/native/src/TableJni.cpp @@ -1331,7 +1331,8 @@ JNIEXPORT void JNICALL Java_ai_rapids_cudf_Table_endWriteCSVToBuffer(JNIEnv *env } JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Table_readAndInferJSON( - JNIEnv *env, jclass, jlong buffer, jlong buffer_length, jboolean day_first, jboolean lines) { + JNIEnv *env, jclass, jlong buffer, jlong buffer_length, jboolean day_first, jboolean lines, + jboolean recover_with_null) { JNI_NULL_CHECK(env, buffer, "buffer cannot be null", 0); if (buffer_length <= 0) { @@ -1344,9 +1345,13 @@ JNIEXPORT jlong JNICALL 
Java_ai_rapids_cudf_Table_readAndInferJSON( auto source = cudf::io::source_info{reinterpret_cast(buffer), static_cast(buffer_length)}; + auto const recovery_mode = recover_with_null ? + cudf::io::json_recovery_mode_t::RECOVER_WITH_NULL : + cudf::io::json_recovery_mode_t::FAIL; cudf::io::json_reader_options_builder opts = cudf::io::json_reader_options::builder(source) .dayfirst(static_cast(day_first)) - .lines(static_cast(lines)); + .lines(static_cast(lines)) + .recovery_mode(recovery_mode); auto result = std::make_unique(cudf::io::read_json(opts.build())); @@ -1404,7 +1409,8 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_TableWithMeta_releaseTable(JNIE JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Table_readJSON( JNIEnv *env, jclass, jobjectArray col_names, jintArray j_types, jintArray j_scales, - jstring inputfilepath, jlong buffer, jlong buffer_length, jboolean day_first, jboolean lines) { + jstring inputfilepath, jlong buffer, jlong buffer_length, jboolean day_first, jboolean lines, + jboolean recover_with_null) { bool read_buffer = true; if (buffer == 0) { @@ -1448,9 +1454,13 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Table_readJSON( static_cast(buffer_length)} : cudf::io::source_info{filename.get()}; + cudf::io::json_recovery_mode_t recovery_mode = + recover_with_null ? cudf::io::json_recovery_mode_t::RECOVER_WITH_NULL : + cudf::io::json_recovery_mode_t::FAIL; cudf::io::json_reader_options_builder opts = cudf::io::json_reader_options::builder(source) .dayfirst(static_cast(day_first)) - .lines(static_cast(lines)); + .lines(static_cast(lines)) + .recovery_mode(recovery_mode); if (!n_col_names.is_null() && data_types.size() > 0) { if (n_col_names.size() != n_types.size()) { diff --git a/java/src/test/java/ai/rapids/cudf/TableTest.java b/java/src/test/java/ai/rapids/cudf/TableTest.java index 3740328615a..59f0d180c6e 100644 --- a/java/src/test/java/ai/rapids/cudf/TableTest.java +++ b/java/src/test/java/ai/rapids/cudf/TableTest.java @@ -86,6 +86,7 @@ public class TableTest extends CudfTestBase { private static final File TEST_ALL_TYPES_PLAIN_AVRO_FILE = TestUtils.getResourceAsFile("alltypes_plain.avro"); private static final File TEST_SIMPLE_CSV_FILE = TestUtils.getResourceAsFile("simple.csv"); private static final File TEST_SIMPLE_JSON_FILE = TestUtils.getResourceAsFile("people.json"); + private static final File TEST_JSON_ERROR_FILE = TestUtils.getResourceAsFile("people_with_invalid_lines.json"); private static final Schema CSV_DATA_BUFFER_SCHEMA = Schema.builder() .column(DType.INT32, "A") @@ -326,6 +327,39 @@ void testReadJSONFile() { } } + @Test + void testReadJSONFileWithInvalidLines() { + Schema schema = Schema.builder() + .column(DType.STRING, "name") + .column(DType.INT32, "age") + .build(); + + // test with recoverWithNulls=true + { + JSONOptions opts = JSONOptions.builder() + .withLines(true) + .withRecoverWithNull(true) + .build(); + try (Table expected = new Table.TestBuilder() + .column("Michael", "Andy", null, "Justin") + .column(null, 30, null, 19) + .build(); + Table table = Table.readJSON(schema, opts, TEST_JSON_ERROR_FILE)) { + assertTablesAreEqual(expected, table); + } + } + + // test with recoverWithNulls=false + { + JSONOptions opts = JSONOptions.builder() + .withLines(true) + .withRecoverWithNull(false) + .build(); + assertThrows(CudfException.class, () -> + Table.readJSON(schema, opts, TEST_JSON_ERROR_FILE)); + } + } + @Test void testReadJSONFileWithDifferentColumnOrder() { Schema schema = Schema.builder() diff --git 
a/java/src/test/resources/people_with_invalid_lines.json b/java/src/test/resources/people_with_invalid_lines.json new file mode 100644 index 00000000000..a99592e3eca --- /dev/null +++ b/java/src/test/resources/people_with_invalid_lines.json @@ -0,0 +1,4 @@ +{"name":"Michael"} +{"name":"Andy", "age":30} +this_line_is_not_valid +{"name":"Justin", "age":19} From 3be772fc5560127ff0ba6ad99d1cf618176e57fd Mon Sep 17 00:00:00 2001 From: Ed Seidl Date: Tue, 12 Sep 2023 17:26:49 -0700 Subject: [PATCH 168/230] Global stream pool (#13922) #13637 added a static stream pool object for use by the Parquet reader. This PR expands upon that by: - Moving the stream pool to the `cudf::detail` namespace. - Adding a debugging implementation that always returns the default stream. - Hiding implementation details behind a more streamlined interface. - Using cuda events for synchronization. Authors: - Ed Seidl (https://github.com/etseidl) - Vukasin Milovanovic (https://github.com/vuule) - Mark Harris (https://github.com/harrism) Approvers: - Bradley Dice (https://github.com/bdice) - Vukasin Milovanovic (https://github.com/vuule) - Mark Harris (https://github.com/harrism) URL: https://github.com/rapidsai/cudf/pull/13922 --- cpp/CMakeLists.txt | 1 + .../cudf/detail/utilities/stream_pool.hpp | 64 +++++ cpp/src/io/parquet/reader_impl.cpp | 43 +-- cpp/src/io/text/multibyte_split.cu | 48 +--- cpp/src/utilities/stream_pool.cpp | 256 ++++++++++++++++++ 5 files changed, 341 insertions(+), 71 deletions(-) create mode 100644 cpp/include/cudf/detail/utilities/stream_pool.hpp create mode 100644 cpp/src/utilities/stream_pool.cpp diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 516865e5782..c37d05a21c7 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -633,6 +633,7 @@ add_library( src/utilities/linked_column.cpp src/utilities/logger.cpp src/utilities/stacktrace.cpp + src/utilities/stream_pool.cpp src/utilities/traits.cpp src/utilities/type_checks.cpp src/utilities/type_dispatcher.cpp diff --git a/cpp/include/cudf/detail/utilities/stream_pool.hpp b/cpp/include/cudf/detail/utilities/stream_pool.hpp new file mode 100644 index 00000000000..95384a9d73e --- /dev/null +++ b/cpp/include/cudf/detail/utilities/stream_pool.hpp @@ -0,0 +1,64 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include + +#include + +#include +#include + +namespace cudf::detail { + +/** + * @brief Acquire a set of `cuda_stream_view` objects and synchronize them to an event on another + * stream. + * + * By default an underlying `rmm::cuda_stream_pool` is used to obtain the streams. The only other + * implementation at present is a debugging version that always returns the stream returned by + * `cudf::get_default_stream()`. To use this debugging version, set the environment variable + * `LIBCUDF_USE_DEBUG_STREAM_POOL`. 
+ * + * Example usage: + * @code{.cpp} + * auto stream = cudf::get_default_stream(); + * auto const num_streams = 2; + * // do work on stream + * // allocate streams and wait for an event on stream before executing on any of streams + * auto streams = cudf::detail::fork_stream(stream, num_streams); + * // do work on streams[0] and streams[1] + * // wait for event on streams before continuing to do work on stream + * cudf::detail::join_streams(streams, stream); + * @endcode + * + * @param stream Stream that the returned streams will wait on. + * @param count The number of `cuda_stream_view` objects to return. + * @return Vector containing `count` stream views. + */ +[[nodiscard]] std::vector fork_streams(rmm::cuda_stream_view stream, + std::size_t count); + +/** + * @brief Synchronize a stream to an event on a set of streams. + * + * @param streams Streams to wait on. + * @param stream Joined stream that synchronizes with the waited-on streams. + */ +void join_streams(host_span streams, rmm::cuda_stream_view stream); + +} // namespace cudf::detail diff --git a/cpp/src/io/parquet/reader_impl.cpp b/cpp/src/io/parquet/reader_impl.cpp index 8a73c43be3e..8b0a0bd4eb0 100644 --- a/cpp/src/io/parquet/reader_impl.cpp +++ b/cpp/src/io/parquet/reader_impl.cpp @@ -18,31 +18,15 @@ #include #include +#include #include #include +#include #include namespace cudf::io::detail::parquet { -namespace { - -int constexpr NUM_DECODERS = 3; // how many decode kernels are there to run -int constexpr APPROX_NUM_THREADS = 4; // guestimate from DaveB -int constexpr STREAM_POOL_SIZE = NUM_DECODERS * APPROX_NUM_THREADS; - -auto& get_stream_pool() -{ - // TODO: creating this on the heap because there were issues with trying to call the - // stream pool destructor during cuda shutdown that lead to a segmentation fault in - // nvbench. this allocation is being deliberately leaked to avoid the above, but still - // results in non-fatal warnings when running nvbench in cuda-gdb. 
- static auto pool = new rmm::cuda_stream_pool{STREAM_POOL_SIZE}; - return *pool; -} - -} // namespace - void reader::impl::decode_page_data(size_t skip_rows, size_t num_rows) { auto& chunks = _file_itm_data.chunks; @@ -178,34 +162,33 @@ void reader::impl::decode_page_data(size_t skip_rows, size_t num_rows) chunks.host_to_device_async(_stream); chunk_nested_valids.host_to_device_async(_stream); chunk_nested_data.host_to_device_async(_stream); - _stream.synchronize(); - auto const level_type_size = _file_itm_data.level_type_size; + // get the number of streams we need from the pool and tell them to wait on the H2D copies + int const nkernels = std::bitset<32>(kernel_mask).count(); + auto streams = cudf::detail::fork_streams(_stream, nkernels); - // vector of launched streams - std::vector streams; + auto const level_type_size = _file_itm_data.level_type_size; // launch string decoder + int s_idx = 0; if (has_strings) { - streams.push_back(get_stream_pool().get_stream()); - chunk_nested_str_data.host_to_device_async(streams.back()); - gpu::DecodeStringPageData(pages, chunks, num_rows, skip_rows, level_type_size, streams.back()); + auto& stream = streams[s_idx++]; + chunk_nested_str_data.host_to_device_async(stream); + gpu::DecodeStringPageData(pages, chunks, num_rows, skip_rows, level_type_size, stream); } // launch delta binary decoder if ((kernel_mask & gpu::KERNEL_MASK_DELTA_BINARY) != 0) { - streams.push_back(get_stream_pool().get_stream()); - gpu::DecodeDeltaBinary(pages, chunks, num_rows, skip_rows, level_type_size, streams.back()); + gpu::DecodeDeltaBinary(pages, chunks, num_rows, skip_rows, level_type_size, streams[s_idx++]); } // launch the catch-all page decoder if ((kernel_mask & gpu::KERNEL_MASK_GENERAL) != 0) { - streams.push_back(get_stream_pool().get_stream()); - gpu::DecodePageData(pages, chunks, num_rows, skip_rows, level_type_size, streams.back()); + gpu::DecodePageData(pages, chunks, num_rows, skip_rows, level_type_size, streams[s_idx++]); } // synchronize the streams - std::for_each(streams.begin(), streams.end(), [](auto& stream) { stream.synchronize(); }); + cudf::detail::join_streams(streams, _stream); pages.device_to_host_async(_stream); page_nesting.device_to_host_async(_stream); diff --git a/cpp/src/io/text/multibyte_split.cu b/cpp/src/io/text/multibyte_split.cu index 818bbc0a18a..772bcad8ada 100644 --- a/cpp/src/io/text/multibyte_split.cu +++ b/cpp/src/io/text/multibyte_split.cu @@ -22,6 +22,7 @@ #include #include #include +#include #include #include #include @@ -32,7 +33,6 @@ #include #include -#include #include #include #include @@ -301,44 +301,12 @@ namespace io { namespace text { namespace detail { -void fork_stream(std::vector streams, rmm::cuda_stream_view stream) -{ - cudaEvent_t event; - CUDF_CUDA_TRY(cudaEventCreate(&event)); - CUDF_CUDA_TRY(cudaEventRecord(event, stream)); - for (uint32_t i = 0; i < streams.size(); i++) { - CUDF_CUDA_TRY(cudaStreamWaitEvent(streams[i], event, 0)); - } - CUDF_CUDA_TRY(cudaEventDestroy(event)); -} - -void join_stream(std::vector streams, rmm::cuda_stream_view stream) -{ - cudaEvent_t event; - CUDF_CUDA_TRY(cudaEventCreate(&event)); - for (uint32_t i = 0; i < streams.size(); i++) { - CUDF_CUDA_TRY(cudaEventRecord(event, streams[i])); - CUDF_CUDA_TRY(cudaStreamWaitEvent(stream, event, 0)); - } - CUDF_CUDA_TRY(cudaEventDestroy(event)); -} - -std::vector get_streams(int32_t count, rmm::cuda_stream_pool& stream_pool) -{ - auto streams = std::vector(); - for (int32_t i = 0; i < count; i++) { - 
streams.emplace_back(stream_pool.get_stream()); - } - return streams; -} - std::unique_ptr multibyte_split(cudf::io::text::data_chunk_source const& source, std::string const& delimiter, byte_range_info byte_range, bool strip_delimiters, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr, - rmm::cuda_stream_pool& stream_pool) + rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); @@ -365,8 +333,7 @@ std::unique_ptr multibyte_split(cudf::io::text::data_chunk_source CUDF_EXPECTS(delimiter.size() < multistate::max_segment_value, "delimiter contains too many total tokens to produce a deterministic result."); - auto concurrency = 2; - auto streams = get_streams(concurrency, stream_pool); + auto const concurrency = 2; // must be at least 32 when using warp-reduce on partials // must be at least 1 more than max possible concurrent tiles @@ -411,7 +378,7 @@ std::unique_ptr multibyte_split(cudf::io::text::data_chunk_source output_builder row_offset_storage(ITEMS_PER_CHUNK, max_growth, stream); output_builder char_storage(ITEMS_PER_CHUNK, max_growth, stream); - fork_stream(streams, stream); + auto streams = cudf::detail::fork_streams(stream, concurrency); cudaEvent_t last_launch_event; CUDF_CUDA_TRY(cudaEventCreate(&last_launch_event)); @@ -532,7 +499,7 @@ std::unique_ptr multibyte_split(cudf::io::text::data_chunk_source CUDF_CUDA_TRY(cudaEventDestroy(last_launch_event)); - join_stream(streams, stream); + cudf::detail::join_streams(streams, stream); // if the input was empty, we didn't find a delimiter at all, // or the first delimiter was also the last: empty output @@ -602,11 +569,10 @@ std::unique_ptr multibyte_split(cudf::io::text::data_chunk_source parse_options options, rmm::mr::device_memory_resource* mr) { - auto stream = cudf::get_default_stream(); - auto stream_pool = rmm::cuda_stream_pool(2); + auto stream = cudf::get_default_stream(); auto result = detail::multibyte_split( - source, delimiter, options.byte_range, options.strip_delimiters, stream, mr, stream_pool); + source, delimiter, options.byte_range, options.strip_delimiters, stream, mr); return result; } diff --git a/cpp/src/utilities/stream_pool.cpp b/cpp/src/utilities/stream_pool.cpp new file mode 100644 index 00000000000..b3b20889ef8 --- /dev/null +++ b/cpp/src/utilities/stream_pool.cpp @@ -0,0 +1,256 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#include + +#include + +#include +#include +#include +#include +#include + +namespace cudf::detail { + +namespace { + +// TODO: what is a good number here. what's the penalty for making it larger? +// Dave Baranec rule of thumb was max_streams_needed * num_concurrent_threads, +// where num_concurrent_threads was estimated to be 4. so using 32 will allow +// for 8 streams per thread, which should be plenty (decoding will be up to 4 +// kernels when delta_byte_array decoding is added). rmm::cuda_stream_pool +// defaults to 16. 
+std::size_t constexpr STREAM_POOL_SIZE = 32; + +// FIXME: "borrowed" from rmm...remove when this stream pool is moved there +#ifdef NDEBUG +#define CUDF_ASSERT_CUDA_SUCCESS(_call) \ + do { \ + (_call); \ + } while (0); +#else +#define CUDF_ASSERT_CUDA_SUCCESS(_call) \ + do { \ + cudaError_t const status__ = (_call); \ + if (status__ != cudaSuccess) { \ + std::cerr << "CUDA Error detected. " << cudaGetErrorName(status__) << " " \ + << cudaGetErrorString(status__) << std::endl; \ + } \ + /* NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-array-to-pointer-decay) */ \ + assert(status__ == cudaSuccess); \ + } while (0) +#endif + +class cuda_stream_pool { + public: + // matching type used in rmm::cuda_stream_pool::get_stream(stream_id) + using stream_id_type = std::size_t; + + virtual ~cuda_stream_pool() = default; + + /** + * @brief Get a `cuda_stream_view` of a stream in the pool. + * + * This function is thread safe with respect to other calls to the same function. + * + * @return Stream view. + */ + virtual rmm::cuda_stream_view get_stream() = 0; + + /** + * @brief Get a `cuda_stream_view` of the stream associated with `stream_id`. + * + * Equivalent values of `stream_id` return a `cuda_stream_view` to the same underlying stream. + * This function is thread safe with respect to other calls to the same function. + * + * @param stream_id Unique identifier for the desired stream + * @return Requested stream view. + */ + virtual rmm::cuda_stream_view get_stream(stream_id_type stream_id) = 0; + + /** + * @brief Get a set of `cuda_stream_view` objects from the pool. + * + * An attempt is made to ensure that the returned vector does not contain duplicate + * streams, but this cannot be guaranteed if `count` is greater than the value returned by + * `get_stream_pool_size()`. + * + * This function is thread safe with respect to other calls to the same function. + * + * @param count The number of stream views to return. + * @return Vector containing `count` stream views. + */ + virtual std::vector get_streams(std::size_t count) = 0; + + /** + * @brief Get the number of stream objects in the pool. + * + * This function is thread safe with respect to other calls to the same function. + * + * @return the number of stream objects in the pool + */ + virtual std::size_t get_stream_pool_size() const = 0; +}; + +/** + * @brief Implementation of `cuda_stream_pool` that wraps an `rmm::cuda_stram_pool`. 
+ */ +class rmm_cuda_stream_pool : public cuda_stream_pool { + rmm::cuda_stream_pool _pool; + + public: + rmm_cuda_stream_pool() : _pool{STREAM_POOL_SIZE} {} + rmm::cuda_stream_view get_stream() override { return _pool.get_stream(); } + rmm::cuda_stream_view get_stream(stream_id_type stream_id) override + { + return _pool.get_stream(stream_id); + } + + std::vector get_streams(std::size_t count) override + { + if (count > STREAM_POOL_SIZE) { + CUDF_LOG_WARN("get_streams called with count ({}) > pool size ({})", count, STREAM_POOL_SIZE); + } + auto streams = std::vector(); + for (uint32_t i = 0; i < count; i++) { + streams.emplace_back(_pool.get_stream()); + } + return streams; + } + + std::size_t get_stream_pool_size() const override { return STREAM_POOL_SIZE; } +}; + +/** + * @brief Implementation of `cuda_stream_pool` that always returns `cudf::get_default_stream()` + */ +class debug_cuda_stream_pool : public cuda_stream_pool { + public: + rmm::cuda_stream_view get_stream() override { return cudf::get_default_stream(); } + rmm::cuda_stream_view get_stream(stream_id_type stream_id) override + { + return cudf::get_default_stream(); + } + + std::vector get_streams(std::size_t count) override + { + return std::vector(count, cudf::get_default_stream()); + } + + std::size_t get_stream_pool_size() const override { return 1UL; } +}; + +/** + * @brief Initialize global stream pool. + */ +cuda_stream_pool* create_global_cuda_stream_pool() +{ + if (getenv("LIBCUDF_USE_DEBUG_STREAM_POOL")) return new debug_cuda_stream_pool(); + + return new rmm_cuda_stream_pool(); +} + +// FIXME: these will be available in rmm soon +inline int get_num_cuda_devices() +{ + rmm::cuda_device_id::value_type num_dev{}; + CUDF_CUDA_TRY(cudaGetDeviceCount(&num_dev)); + return num_dev; +} + +rmm::cuda_device_id get_current_cuda_device() +{ + int device_id; + CUDF_CUDA_TRY(cudaGetDevice(&device_id)); + return rmm::cuda_device_id{device_id}; +} + +/** + * @brief RAII struct to wrap a cuda event and ensure its proper destruction. + */ +struct cuda_event { + cuda_event() { CUDF_CUDA_TRY(cudaEventCreateWithFlags(&e_, cudaEventDisableTiming)); } + virtual ~cuda_event() { CUDF_ASSERT_CUDA_SUCCESS(cudaEventDestroy(e_)); } + + operator cudaEvent_t() { return e_; } + + private: + cudaEvent_t e_; +}; + +/** + * @brief Returns a cudaEvent_t for the current thread. + * + * The returned event is valid for the current device. + * + * @return A cudaEvent_t unique to the current thread and valid on the current device. + */ +cudaEvent_t event_for_thread() +{ + thread_local std::vector> thread_events(get_num_cuda_devices()); + auto const device_id = get_current_cuda_device(); + if (not thread_events[device_id.value()]) { + thread_events[device_id.value()] = std::make_unique(); + } + return *thread_events[device_id.value()]; +} + +/** + * @brief Returns a reference to the global stream pool for the current device. + * @return `cuda_stream_pool` valid on the current device. + */ +cuda_stream_pool& global_cuda_stream_pool() +{ + // using bare pointers here to deliberately allow them to leak. otherwise we wind up with + // seg faults trying to destroy stream objects after the context has shut down. 
+ static std::vector pools(get_num_cuda_devices()); + static std::mutex mutex; + auto const device_id = get_current_cuda_device(); + + std::lock_guard lock(mutex); + if (pools[device_id.value()] == nullptr) { + pools[device_id.value()] = create_global_cuda_stream_pool(); + } + return *pools[device_id.value()]; +} + +} // anonymous namespace + +std::vector fork_streams(rmm::cuda_stream_view stream, std::size_t count) +{ + auto const streams = global_cuda_stream_pool().get_streams(count); + auto const event = event_for_thread(); + CUDF_CUDA_TRY(cudaEventRecord(event, stream)); + std::for_each(streams.begin(), streams.end(), [&](auto& strm) { + CUDF_CUDA_TRY(cudaStreamWaitEvent(strm, event, 0)); + }); + return streams; +} + +void join_streams(host_span streams, rmm::cuda_stream_view stream) +{ + auto const event = event_for_thread(); + std::for_each(streams.begin(), streams.end(), [&](auto& strm) { + CUDF_CUDA_TRY(cudaEventRecord(event, strm)); + CUDF_CUDA_TRY(cudaStreamWaitEvent(stream, event, 0)); + }); +} + +} // namespace cudf::detail From c13b78309cc9f07ffde7e4794fdc04cb0a90a1ab Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Tue, 12 Sep 2023 20:19:59 -1000 Subject: [PATCH 169/230] Validate ignore_index type in drop_duplicates (#14098) Currently allows odd behavior like ```python In [1]: import cudf In [4]: df = cudf.DataFrame({"a": [1, 2, 1, 3]}) In [6]: df.drop_duplicates(ignore_index="True") Out[6]: a 0 1 1 2 2 3 ``` Authors: - Matthew Roeschke (https://github.com/mroeschke) Approvers: - GALI PREM SAGAR (https://github.com/galipremsagar) URL: https://github.com/rapidsai/cudf/pull/14098 --- python/cudf/cudf/core/indexed_frame.py | 5 +++++ python/cudf/cudf/tests/test_duplicates.py | 6 ++++++ 2 files changed, 11 insertions(+) diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py index 69b25c51a66..518262ae926 100644 --- a/python/cudf/cudf/core/indexed_frame.py +++ b/python/cudf/cudf/core/indexed_frame.py @@ -1961,6 +1961,11 @@ def drop_duplicates( ignore_index: bool, default False If True, the resulting axis will be labeled 0, 1, ..., n - 1. """ + if not isinstance(ignore_index, (np.bool_, bool)): + raise ValueError( + f"{ignore_index=} must be bool, " + f"not {type(ignore_index).__name__}" + ) subset = self._preprocess_subset(subset) subset_cols = [name for name in self._column_names if name in subset] if len(subset_cols) == 0: diff --git a/python/cudf/cudf/tests/test_duplicates.py b/python/cudf/cudf/tests/test_duplicates.py index 8a83ec150bc..f77e7b4d775 100644 --- a/python/cudf/cudf/tests/test_duplicates.py +++ b/python/cudf/cudf/tests/test_duplicates.py @@ -623,3 +623,9 @@ def test_drop_duplicates_multi_index(): gdf[col].drop_duplicates().to_pandas(), pdf[col].drop_duplicates(), ) + + +def test_drop_duplicates_ignore_index_wrong_type(): + gdf = cudf.DataFrame([1, 1, 2]) + with pytest.raises(ValueError): + gdf.drop_duplicates(ignore_index="True") From 99c77111a20a2aea849d234ebe4c36171dc885fc Mon Sep 17 00:00:00 2001 From: GALI PREM SAGAR Date: Wed, 13 Sep 2023 01:31:35 -0500 Subject: [PATCH 170/230] Add support for `__round__` in `Series` and `DataFrame` (#14099) Fixes: #14083 This PR fixes builtin function `round` call on `DataFrame` & `Series` to work by implementing `__round__` method. 
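A minimal sketch of what this enables (the values in comments are indicative and mirror the tests added below):

```python
import cudf

ser = cudf.Series([1.2234242333234, 323432.3243423])
df = cudf.DataFrame({"a": ser})

# The builtin round() now dispatches to Series.round()/DataFrame.round()
# through __round__; previously it raised TypeError because no __round__
# method was defined.
round(ser, 2)  # Series with values [1.22, 323432.32]
round(df)      # DataFrame rounded to 0 decimal places
```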
Authors: - GALI PREM SAGAR (https://github.com/galipremsagar) Approvers: - Bradley Dice (https://github.com/bdice) URL: https://github.com/rapidsai/cudf/pull/14099 --- python/cudf/cudf/core/indexed_frame.py | 6 +++++ python/cudf/cudf/tests/test_dataframe.py | 23 +++++++++++++++++++ python/cudf/cudf/tests/test_series.py | 29 ++++++++++++++++++++++++ 3 files changed, 58 insertions(+) diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py index 518262ae926..62e091b29b5 100644 --- a/python/cudf/cudf/core/indexed_frame.py +++ b/python/cudf/cudf/core/indexed_frame.py @@ -358,6 +358,12 @@ def _from_columns_like_self( override_dtypes=override_dtypes, ) + def __round__(self, digits=0): + # Shouldn't be added to BinaryOperand + # because pandas Index doesn't implement + # this method. + return self.round(decimals=digits) + def _mimic_inplace( self, result: Self, inplace: bool = False ) -> Optional[Self]: diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py index 44d0b9249d0..61372bab3ad 100644 --- a/python/cudf/cudf/tests/test_dataframe.py +++ b/python/cudf/cudf/tests/test_dataframe.py @@ -10326,3 +10326,26 @@ def test_dataframe_nlargest_nsmallest_str_error(attr): ([], {"n": 1, "columns": ["a", "b"]}), ([], {"n": 1, "columns": ["a", "b"]}), ) + + +@pytest.mark.parametrize("digits", [0, 1, 3, 4, 10]) +def test_dataframe_round_builtin(digits): + pdf = pd.DataFrame( + { + "a": [1.2234242333234, 323432.3243423, np.nan], + "b": ["a", "b", "c"], + "c": pd.Series([34224, 324324, 324342], dtype="datetime64[ns]"), + "d": pd.Series([224.242, None, 2424.234324], dtype="category"), + "e": [ + decimal.Decimal("342.3243234234242"), + decimal.Decimal("89.32432497687622"), + None, + ], + } + ) + gdf = cudf.from_pandas(pdf, nan_as_null=False) + + expected = round(pdf, digits) + actual = round(gdf, digits) + + assert_eq(expected, actual) diff --git a/python/cudf/cudf/tests/test_series.py b/python/cudf/cudf/tests/test_series.py index 8a652caa6e2..798809b0ada 100644 --- a/python/cudf/cudf/tests/test_series.py +++ b/python/cudf/cudf/tests/test_series.py @@ -1,5 +1,6 @@ # Copyright (c) 2020-2023, NVIDIA CORPORATION. 
+import decimal import hashlib import operator import re @@ -2282,3 +2283,31 @@ def test_series_rename(initial_name, name): expected = psr.rename(name) assert_eq(actual, expected) + + +@pytest.mark.parametrize( + "data", + [ + [1.2234242333234, 323432.3243423, np.nan], + pd.Series([34224, 324324, 324342], dtype="datetime64[ns]"), + pd.Series([224.242, None, 2424.234324], dtype="category"), + [ + decimal.Decimal("342.3243234234242"), + decimal.Decimal("89.32432497687622"), + None, + ], + ], +) +@pytest.mark.parametrize("digits", [0, 1, 3, 4, 10]) +def test_series_round_builtin(data, digits): + ps = pd.Series(data) + gs = cudf.from_pandas(ps, nan_as_null=False) + + # TODO: Remove `to_frame` workaround + # after following issue is fixed: + # https://github.com/pandas-dev/pandas/issues/55114 + expected = round(ps.to_frame(), digits)[0] + expected.name = None + actual = round(gs, digits) + + assert_eq(expected, actual) From 1668c2caac27c5c92dfeddb20271b835b36c5615 Mon Sep 17 00:00:00 2001 From: Robert Maynard Date: Wed, 13 Sep 2023 12:09:10 -0400 Subject: [PATCH 171/230] Only use memory resources that haven't been freed (#14103) Fixes #13859 Authors: - Robert Maynard (https://github.com/robertmaynard) Approvers: - Bradley Dice (https://github.com/bdice) - David Wendt (https://github.com/davidwendt) URL: https://github.com/rapidsai/cudf/pull/14103 --- cpp/include/cudf_test/base_fixture.hpp | 1 + 1 file changed, 1 insertion(+) diff --git a/cpp/include/cudf_test/base_fixture.hpp b/cpp/include/cudf_test/base_fixture.hpp index 05319e03003..b622d7c6b78 100644 --- a/cpp/include/cudf_test/base_fixture.hpp +++ b/cpp/include/cudf_test/base_fixture.hpp @@ -392,6 +392,7 @@ inline auto parse_cudf_test_opts(int argc, char** argv) auto adaptor = make_stream_checking_resource_adaptor( \ resource.get(), error_on_invalid_stream, check_default_stream); \ rmm::mr::set_current_device_resource(&adaptor); \ + return RUN_ALL_TESTS(); \ } \ \ return RUN_ALL_TESTS(); \ From 60009a8005a8b9b69c2c870465b5cf46532d3388 Mon Sep 17 00:00:00 2001 From: GALI PREM SAGAR Date: Wed, 13 Sep 2023 17:12:44 -0500 Subject: [PATCH 172/230] Fix naming issues with `Index.to_frame` and `MultiIndex.to_frame` APIs (#14105) This PR: - [x] Introduces `allow_duplicates` for parity with `MultiIndex.to_frame` - however this parameter is non-functional since cudf doesn't support duplicate column names. - [x] Fixed handling of duplicate index names in `MultiIndex.to_frame` - [x] Added proper docs for `Index.to_frame` & `MultiIndex.to_frame` separately due to change in API signature. - [x] Added tests for `Index.to_frame` & `MultiIndex.to_frame` - [x] Introduced deprecations that will go away when pandas-2.0 support is enabled. 
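A condensed sketch of the user-facing behaviour from the checklist above (the example data is illustrative; the error message and warning follow the new docstrings and tests below):

```python
import cudf

idx = cudf.Index(["Ant", "Bear", "Cow"], name="animal")
idx.to_frame(index=False, name="zoo")   # DataFrame with a single column named "zoo"

mi = cudf.MultiIndex.from_tuples([("a", "c"), ("b", "d")], names=["x", "x"])
# Duplicate level names are now rejected, since cudf does not support
# duplicate column labels:
#   mi.to_frame()  ->  ValueError: Duplicate column names are not allowed
mi.to_frame(name=["x", "y"])            # explicit, unique names still work

# Explicitly passing name=None is deprecated and emits a FutureWarning;
# in the future, None will be used as the resulting column name.
idx.to_frame(name=None)
```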
Authors: - GALI PREM SAGAR (https://github.com/galipremsagar) Approvers: - Bradley Dice (https://github.com/bdice) URL: https://github.com/rapidsai/cudf/pull/14105 --- python/cudf/cudf/core/_base_index.py | 57 +++++++++++-- python/cudf/cudf/core/multiindex.py | 99 ++++++++++++++++++++--- python/cudf/cudf/tests/test_index.py | 19 +++++ python/cudf/cudf/tests/test_multiindex.py | 83 +++++++++++++++++++ 4 files changed, 242 insertions(+), 16 deletions(-) diff --git a/python/cudf/cudf/core/_base_index.py b/python/cudf/cudf/core/_base_index.py index 2f6e864b51c..c0bd9ec6eee 100644 --- a/python/cudf/cudf/core/_base_index.py +++ b/python/cudf/cudf/core/_base_index.py @@ -19,6 +19,7 @@ drop_nulls, ) from cudf._lib.types import size_type_dtype +from cudf.api.extensions import no_default from cudf.api.types import ( is_bool_dtype, is_integer, @@ -701,21 +702,65 @@ def fillna(self, value, downcast=None): return super().fillna(value=value) - def to_frame(self, index=True, name=None): + def to_frame(self, index=True, name=no_default): """Create a DataFrame with a column containing this Index Parameters ---------- index : boolean, default True Set the index of the returned DataFrame as the original Index - name : str, default None - Name to be used for the column + name : object, defaults to index.name + The passed name should substitute for the index name (if it has + one). + Returns ------- DataFrame - cudf DataFrame - """ - if name is not None: + DataFrame containing the original Index data. + + See Also + -------- + Index.to_series : Convert an Index to a Series. + Series.to_frame : Convert Series to DataFrame. + + Examples + -------- + >>> import cudf + >>> idx = cudf.Index(['Ant', 'Bear', 'Cow'], name='animal') + >>> idx.to_frame() + animal + animal + Ant Ant + Bear Bear + Cow Cow + + By default, the original Index is reused. To enforce a new Index: + + >>> idx.to_frame(index=False) + animal + 0 Ant + 1 Bear + 2 Cow + + To override the name of the resulting column, specify `name`: + + >>> idx.to_frame(index=False, name='zoo') + zoo + 0 Ant + 1 Bear + 2 Cow + """ + if name is None: + warnings.warn( + "Explicitly passing `name=None` currently preserves " + "the Index's name or uses a default name of 0. 
This " + "behaviour is deprecated, and in the future `None` " + "will be used as the name of the " + "resulting DataFrame column.", + FutureWarning, + ) + name = no_default + if name is not no_default: col_name = name elif self.name is None: col_name = 0 diff --git a/python/cudf/cudf/core/multiindex.py b/python/cudf/cudf/core/multiindex.py index bc6726879c1..21380bb841c 100644 --- a/python/cudf/cudf/core/multiindex.py +++ b/python/cudf/cudf/core/multiindex.py @@ -20,6 +20,7 @@ import cudf from cudf import _lib as libcudf from cudf._typing import DataFrameOrSeries +from cudf.api.extensions import no_default from cudf.api.types import is_integer, is_list_like, is_object_dtype from cudf.core import column from cudf.core._compat import PANDAS_GE_150 @@ -1015,7 +1016,12 @@ def __getitem__(self, index): elif isinstance(index, slice): start, stop, step = index.indices(len(self)) index = column.arange(start, stop, step) - result = MultiIndex.from_frame(self.to_frame(index=False).take(index)) + result = MultiIndex.from_frame( + self.to_frame(index=False, name=range(0, self.nlevels)).take( + index + ), + names=self.names, + ) # we are indexing into a single row of the MultiIndex, # return that row as a tuple: @@ -1026,24 +1032,95 @@ def __getitem__(self, index): result._codes = self._codes.take(index) if self._levels is not None: result._levels = self._levels - result.names = self.names return result @_cudf_nvtx_annotate - def to_frame(self, index=True, name=None): + def to_frame(self, index=True, name=no_default, allow_duplicates=False): + """ + Create a DataFrame with the levels of the MultiIndex as columns. + + Column ordering is determined by the DataFrame constructor with data as + a dict. + + Parameters + ---------- + index : bool, default True + Set the index of the returned DataFrame as the original MultiIndex. + name : list / sequence of str, optional + The passed names should substitute index level names. + allow_duplicates : bool, optional default False + Allow duplicate column labels to be created. Note + that this parameter is non-functional because + duplicates column labels aren't supported in cudf. + + Returns + ------- + DataFrame + + Examples + -------- + >>> import cudf + >>> mi = cudf.MultiIndex.from_tuples([('a', 'c'), ('b', 'd')]) + >>> mi + MultiIndex([('a', 'c'), + ('b', 'd')], + ) + + >>> df = mi.to_frame() + >>> df + 0 1 + a c a c + b d b d + + >>> df = mi.to_frame(index=False) + >>> df + 0 1 + 0 a c + 1 b d + + >>> df = mi.to_frame(name=['x', 'y']) + >>> df + x y + a c a c + b d b d + """ # TODO: Currently this function makes a shallow copy, which is # incorrect. We want to make a deep copy, otherwise further # modifications of the resulting DataFrame will affect the MultiIndex. - df = cudf.DataFrame._from_data(data=self._data) - if index: - df = df.set_index(self) - if name is not None: + if name is None: + warnings.warn( + "Explicitly passing `name=None` currently preserves the " + "Index's name or uses a default name of 0. This behaviour " + "is deprecated, and in the future `None` will be used " + "as the name of the resulting DataFrame column.", + FutureWarning, + ) + name = no_default + + if name is not no_default: if len(name) != len(self.levels): raise ValueError( "'name' should have the same length as " "number of levels on index." 
) - df.columns = name + column_names = name + else: + column_names = self.names + all_none_names = None + if not ( + all_none_names := all(x is None for x in column_names) + ) and len(column_names) != len(set(column_names)): + raise ValueError("Duplicate column names are not allowed") + df = cudf.DataFrame._from_data( + data=self._data, + columns=column_names + if name is not no_default and not all_none_names + else None, + ) + + if index: + df = df.set_index(self) + return df @_cudf_nvtx_annotate @@ -1504,7 +1581,9 @@ def droplevel(self, level=-1): @_cudf_nvtx_annotate def to_pandas(self, nullable=False, **kwargs): - result = self.to_frame(index=False).to_pandas(nullable=nullable) + result = self.to_frame( + index=False, name=list(range(self.nlevels)) + ).to_pandas(nullable=nullable) return pd.MultiIndex.from_frame(result, names=self.names) @classmethod @@ -1623,7 +1702,7 @@ def _clean_nulls_from_index(self): Convert all na values(if any) in MultiIndex object to `` as a preprocessing step to `__repr__` methods. """ - index_df = self.to_frame(index=False) + index_df = self.to_frame(index=False, name=list(range(self.nlevels))) return MultiIndex.from_frame( index_df._clean_nulls_from_dataframe(index_df), names=self.names ) diff --git a/python/cudf/cudf/tests/test_index.py b/python/cudf/cudf/tests/test_index.py index 6fb615c22e0..b3791cddce3 100644 --- a/python/cudf/cudf/tests/test_index.py +++ b/python/cudf/cudf/tests/test_index.py @@ -11,6 +11,7 @@ import pytest import cudf +from cudf.api.extensions import no_default from cudf.api.types import is_bool_dtype from cudf.core._compat import PANDAS_GE_133, PANDAS_GE_200 from cudf.core.index import ( @@ -2777,3 +2778,21 @@ def test_index_empty_from_pandas(request, dtype): gidx = cudf.from_pandas(pidx) assert_eq(pidx, gidx) + + +@pytest.mark.parametrize( + "data", [[1, 2, 3], ["ab", "cd", "e", None], range(0, 10)] +) +@pytest.mark.parametrize("data_name", [None, 1, "abc"]) +@pytest.mark.parametrize("index", [True, False]) +@pytest.mark.parametrize("name", [None, no_default, 1, "abc"]) +def test_index_to_frame(data, data_name, index, name): + pidx = pd.Index(data, name=data_name) + gidx = cudf.from_pandas(pidx) + + with expect_warning_if(name is None): + expected = pidx.to_frame(index=index, name=name) + with expect_warning_if(name is None): + actual = gidx.to_frame(index=index, name=name) + + assert_eq(expected, actual) diff --git a/python/cudf/cudf/tests/test_multiindex.py b/python/cudf/cudf/tests/test_multiindex.py index 3c843ace0a8..fb2b0c07efb 100644 --- a/python/cudf/cudf/tests/test_multiindex.py +++ b/python/cudf/cudf/tests/test_multiindex.py @@ -16,6 +16,7 @@ import pytest import cudf +from cudf.api.extensions import no_default from cudf.core._compat import PANDAS_GE_200 from cudf.core.column import as_column from cudf.core.index import as_index @@ -1926,3 +1927,85 @@ def test_multiindex_to_series_error(): midx = cudf.MultiIndex.from_tuples([("a", "b")]) with pytest.raises(NotImplementedError): midx.to_series() + + +@pytest.mark.parametrize( + "pidx", + [ + pd.MultiIndex.from_arrays( + [[1, 2, 3, 4], [5, 6, 7, 10], [11, 12, 12, 13]], + names=["a", "b", "c"], + ), + pd.MultiIndex.from_arrays( + [[1, 2, 3, 4], [5, 6, 7, 10], [11, 12, 12, 13]], + names=["a", "a", "a"], + ), + pd.MultiIndex.from_arrays( + [[1, 2, 3, 4], [5, 6, 7, 10], [11, 12, 12, 13]], + ), + ], +) +@pytest.mark.parametrize( + "name", [None, no_default, ["x", "y", "z"], ["rapids", "rapids", "rapids"]] +) +@pytest.mark.parametrize("allow_duplicates", [True, False]) 
+@pytest.mark.parametrize("index", [True, False]) +def test_multiindex_to_frame_allow_duplicates( + pidx, name, allow_duplicates, index +): + gidx = cudf.from_pandas(pidx) + + if ( + ( + len(pidx.names) != len(set(pidx.names)) + and not all(x is None for x in pidx.names) + ) + and not allow_duplicates + and (name is None or name is no_default) + ): + assert_exceptions_equal( + pidx.to_frame, + gidx.to_frame, + lfunc_args_and_kwargs=( + [], + { + "index": index, + "name": name, + "allow_duplicates": allow_duplicates, + }, + ), + rfunc_args_and_kwargs=( + [], + { + "index": index, + "name": name, + "allow_duplicates": allow_duplicates, + }, + ), + ) + else: + if ( + len(pidx.names) != len(set(pidx.names)) + and not all(x is None for x in pidx.names) + and not isinstance(name, list) + ) or (isinstance(name, list) and len(name) != len(set(name))): + # cudf doesn't have the ability to construct dataframes + # with duplicate column names + with expect_warning_if(name is None): + with pytest.raises(ValueError): + gidx.to_frame( + index=index, + name=name, + allow_duplicates=allow_duplicates, + ) + else: + with expect_warning_if(name is None): + expected = pidx.to_frame( + index=index, name=name, allow_duplicates=allow_duplicates + ) + with expect_warning_if(name is None): + actual = gidx.to_frame( + index=index, name=name, allow_duplicates=allow_duplicates + ) + + assert_eq(expected, actual) From edfef800d98491ee61b390645548f9223bbfb049 Mon Sep 17 00:00:00 2001 From: Nghia Truong <7416935+ttnghia@users.noreply.github.com> Date: Wed, 13 Sep 2023 16:54:45 -0700 Subject: [PATCH 173/230] Refactor `hash_reduce_by_row` (#14095) This PR extracts `hash_reduce_by_row` function from `distinct_reduce.*` files. Previously, that function was designed specifically to work with `distinct` in stream compaction with `size_type` output. Now, it becomes more generic and can support more generic reduction operations and various output types. No new functionality was added. The changes in this work pave the way for implementing histogram/merge histogram aggregations, which also rely on hash-base reduction. 
Authors: - Nghia Truong (https://github.com/ttnghia) Approvers: - Karthikeyan (https://github.com/karthikeyann) - Yunsong Wang (https://github.com/PointKernel) - Bradley Dice (https://github.com/bdice) URL: https://github.com/rapidsai/cudf/pull/14095 --- cpp/CMakeLists.txt | 2 +- .../cudf/detail/hash_reduce_by_row.cuh | 167 ++++++++++++++++++ cpp/src/stream_compaction/distinct.cu | 28 +-- cpp/src/stream_compaction/distinct_count.cu | 4 +- cpp/src/stream_compaction/distinct_helpers.cu | 109 ++++++++++++ ...stinct_reduce.cuh => distinct_helpers.hpp} | 12 +- cpp/src/stream_compaction/distinct_reduce.cu | 150 ---------------- .../stream_compaction_common.cuh | 22 --- .../stream_compaction_common.hpp | 5 - 9 files changed, 299 insertions(+), 200 deletions(-) create mode 100644 cpp/include/cudf/detail/hash_reduce_by_row.cuh create mode 100644 cpp/src/stream_compaction/distinct_helpers.cu rename cpp/src/stream_compaction/{distinct_reduce.cuh => distinct_helpers.hpp} (92%) delete mode 100644 cpp/src/stream_compaction/distinct_reduce.cu diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index c37d05a21c7..900e9eed98e 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -530,7 +530,7 @@ add_library( src/stream_compaction/apply_boolean_mask.cu src/stream_compaction/distinct.cu src/stream_compaction/distinct_count.cu - src/stream_compaction/distinct_reduce.cu + src/stream_compaction/distinct_helpers.cu src/stream_compaction/drop_nans.cu src/stream_compaction/drop_nulls.cu src/stream_compaction/stable_distinct.cu diff --git a/cpp/include/cudf/detail/hash_reduce_by_row.cuh b/cpp/include/cudf/detail/hash_reduce_by_row.cuh new file mode 100644 index 00000000000..2d2b43f1d4a --- /dev/null +++ b/cpp/include/cudf/detail/hash_reduce_by_row.cuh @@ -0,0 +1,167 @@ +/* + * Copyright (c) 2022-2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include + +#include +#include +#include + +#include +#include +#include + +#include + +namespace cudf::detail { + +using hash_map_type = + cuco::static_map; + +/** + * @brief The base struct for customized reduction functor to perform reduce-by-key with keys are + * rows that compared equal. + * + * TODO: We need to switch to use `static_reduction_map` when it is ready + * (https://github.com/NVIDIA/cuCollections/pull/98). + */ +template +struct reduce_by_row_fn_base { + protected: + MapView const d_map; + KeyHasher const d_hasher; + KeyEqual const d_equal; + OutputType* const d_output; + + reduce_by_row_fn_base(MapView const& d_map, + KeyHasher const& d_hasher, + KeyEqual const& d_equal, + OutputType* const d_output) + : d_map{d_map}, d_hasher{d_hasher}, d_equal{d_equal}, d_output{d_output} + { + } + + /** + * @brief Return a pointer to the output array at the given index. 
+ * + * @param idx The access index + * @return A pointer to the given index in the output array + */ + __device__ OutputType* get_output_ptr(size_type const idx) const + { + auto const iter = d_map.find(idx, d_hasher, d_equal); + + if (iter != d_map.end()) { + // Only one (undetermined) index value of the duplicate rows could be inserted into the map. + // As such, looking up for all indices of duplicate rows always returns the same value. + auto const inserted_idx = iter->second.load(cuda::std::memory_order_relaxed); + + // All duplicate rows will have concurrent access to this same output slot. + return &d_output[inserted_idx]; + } else { + // All input `idx` values have been inserted into the map before. + // Thus, searching for an `idx` key resulting in the `end()` iterator only happens if + // `d_equal(idx, idx) == false`. + // Such situations are due to comparing nulls or NaNs which are considered as always unequal. + // In those cases, all rows containing nulls or NaNs are distinct. Just return their direct + // output slot. + return &d_output[idx]; + } + } +}; + +/** + * @brief Perform a reduction on groups of rows that are compared equal. + * + * This is essentially a reduce-by-key operation with keys are non-contiguous rows and are compared + * equal. A hash table is used to find groups of equal rows. + * + * At the beginning of the operation, the entire output array is filled with a value given by + * the `init` parameter. Then, the reduction result for each row group is written into the output + * array at the index of an unspecified row in the group. + * + * @tparam ReduceFuncBuilder The builder class that must have a `build()` method returning a + * reduction functor derived from `reduce_by_row_fn_base` + * @tparam OutputType Type of the reduction results + * @param map The auxiliary map to perform reduction + * @param preprocessed_input The preprocessed of the input rows for computing row hashing and row + * comparisons + * @param num_rows The number of all input rows + * @param has_nulls Indicate whether the input rows has any nulls at any nested levels + * @param has_nested_columns Indicates whether the input table has any nested columns + * @param nulls_equal Flag to specify whether null elements should be considered as equal + * @param nans_equal Flag to specify whether NaN values in floating point column should be + * considered equal. 
+ * @param init The initial value for reduction of each row group + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate the returned vector + * @return A device_uvector containing the reduction results + */ +template +rmm::device_uvector hash_reduce_by_row( + hash_map_type const& map, + std::shared_ptr const preprocessed_input, + size_type num_rows, + cudf::nullate::DYNAMIC has_nulls, + bool has_nested_columns, + null_equality nulls_equal, + nan_equality nans_equal, + ReduceFuncBuilder func_builder, + OutputType init, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + auto const map_dview = map.get_device_view(); + auto const row_hasher = cudf::experimental::row::hash::row_hasher(preprocessed_input); + auto const key_hasher = row_hasher.device_hasher(has_nulls); + auto const row_comp = cudf::experimental::row::equality::self_comparator(preprocessed_input); + + auto reduction_results = rmm::device_uvector(num_rows, stream, mr); + thrust::uninitialized_fill( + rmm::exec_policy(stream), reduction_results.begin(), reduction_results.end(), init); + + auto const reduce_by_row = [&](auto const value_comp) { + if (has_nested_columns) { + auto const key_equal = row_comp.equal_to(has_nulls, nulls_equal, value_comp); + thrust::for_each( + rmm::exec_policy(stream), + thrust::make_counting_iterator(0), + thrust::make_counting_iterator(num_rows), + func_builder.build(map_dview, key_hasher, key_equal, reduction_results.begin())); + } else { + auto const key_equal = row_comp.equal_to(has_nulls, nulls_equal, value_comp); + thrust::for_each( + rmm::exec_policy(stream), + thrust::make_counting_iterator(0), + thrust::make_counting_iterator(num_rows), + func_builder.build(map_dview, key_hasher, key_equal, reduction_results.begin())); + } + }; + + if (nans_equal == nan_equality::ALL_EQUAL) { + using nan_equal_comparator = + cudf::experimental::row::equality::nan_equal_physical_equality_comparator; + reduce_by_row(nan_equal_comparator{}); + } else { + using nan_unequal_comparator = cudf::experimental::row::equality::physical_equality_comparator; + reduce_by_row(nan_unequal_comparator{}); + } + + return reduction_results; +} + +} // namespace cudf::detail diff --git a/cpp/src/stream_compaction/distinct.cu b/cpp/src/stream_compaction/distinct.cu index cc60b2a12ea..cc1e3423d42 100644 --- a/cpp/src/stream_compaction/distinct.cu +++ b/cpp/src/stream_compaction/distinct.cu @@ -14,7 +14,7 @@ * limitations under the License. 
*/ -#include "distinct_reduce.cuh" +#include "distinct_helpers.hpp" #include #include @@ -50,8 +50,8 @@ rmm::device_uvector get_distinct_indices(table_view const& input, } auto map = hash_map_type{compute_hash_table_size(input.num_rows()), - cuco::empty_key{COMPACTION_EMPTY_KEY_SENTINEL}, - cuco::empty_value{COMPACTION_EMPTY_VALUE_SENTINEL}, + cuco::empty_key{-1}, + cuco::empty_value{std::numeric_limits::min()}, detail::hash_table_allocator_type{default_allocator{}, stream}, stream.value()}; @@ -61,7 +61,7 @@ rmm::device_uvector get_distinct_indices(table_view const& input, auto const has_nested_columns = cudf::detail::has_nested_columns(input); auto const row_hasher = cudf::experimental::row::hash::row_hasher(preprocessed_input); - auto const key_hasher = experimental::compaction_hash(row_hasher.device_hasher(has_nulls)); + auto const key_hasher = row_hasher.device_hasher(has_nulls); auto const row_comp = cudf::experimental::row::equality::self_comparator(preprocessed_input); @@ -96,16 +96,16 @@ rmm::device_uvector get_distinct_indices(table_view const& input, } // For other keep options, reduce by row on rows that compare equal. - auto const reduction_results = hash_reduce_by_row(map, - std::move(preprocessed_input), - input.num_rows(), - has_nulls, - has_nested_columns, - keep, - nulls_equal, - nans_equal, - stream, - rmm::mr::get_current_device_resource()); + auto const reduction_results = reduce_by_row(map, + std::move(preprocessed_input), + input.num_rows(), + has_nulls, + has_nested_columns, + keep, + nulls_equal, + nans_equal, + stream, + rmm::mr::get_current_device_resource()); // Extract the desired output indices from reduction results. auto const map_end = [&] { diff --git a/cpp/src/stream_compaction/distinct_count.cu b/cpp/src/stream_compaction/distinct_count.cu index 4bca0827efe..ac4811ad279 100644 --- a/cpp/src/stream_compaction/distinct_count.cu +++ b/cpp/src/stream_compaction/distinct_count.cu @@ -136,14 +136,14 @@ cudf::size_type distinct_count(table_view const& keys, auto const preprocessed_input = cudf::experimental::row::hash::preprocessed_table::create(keys, stream); auto const row_hasher = cudf::experimental::row::hash::row_hasher(preprocessed_input); - auto const hash_key = experimental::compaction_hash(row_hasher.device_hasher(has_nulls)); + auto const hash_key = row_hasher.device_hasher(has_nulls); auto const row_comp = cudf::experimental::row::equality::self_comparator(preprocessed_input); auto const comparator_helper = [&](auto const row_equal) { using hasher_type = decltype(hash_key); auto key_set = cuco::experimental::static_set{ cuco::experimental::extent{compute_hash_table_size(num_rows)}, - cuco::empty_key{COMPACTION_EMPTY_KEY_SENTINEL}, + cuco::empty_key{-1}, row_equal, cuco::experimental::linear_probing<1, hasher_type>{hash_key}, detail::hash_table_allocator_type{default_allocator{}, stream}, diff --git a/cpp/src/stream_compaction/distinct_helpers.cu b/cpp/src/stream_compaction/distinct_helpers.cu new file mode 100644 index 00000000000..8f36ec98f4a --- /dev/null +++ b/cpp/src/stream_compaction/distinct_helpers.cu @@ -0,0 +1,109 @@ +/* + * Copyright (c) 2022-2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "distinct_helpers.hpp" + +#include + +namespace cudf::detail { + +namespace { +/** + * @brief The functor to find the first/last/all duplicate row for rows that compared equal. + */ +template +struct reduce_fn : reduce_by_row_fn_base { + duplicate_keep_option const keep; + + reduce_fn(MapView const& d_map, + KeyHasher const& d_hasher, + KeyEqual const& d_equal, + duplicate_keep_option const keep, + size_type* const d_output) + : reduce_by_row_fn_base{d_map, + d_hasher, + d_equal, + d_output}, + keep{keep} + { + } + + __device__ void operator()(size_type const idx) const + { + auto const out_ptr = this->get_output_ptr(idx); + + if (keep == duplicate_keep_option::KEEP_FIRST) { + // Store the smallest index of all rows that are equal. + atomicMin(out_ptr, idx); + } else if (keep == duplicate_keep_option::KEEP_LAST) { + // Store the greatest index of all rows that are equal. + atomicMax(out_ptr, idx); + } else { + // Count the number of rows in each group of rows that are compared equal. + atomicAdd(out_ptr, size_type{1}); + } + } +}; + +/** + * @brief The builder to construct an instance of `reduce_fn` functor base on the given + * value of the `duplicate_keep_option` member variable. + */ +struct reduce_func_builder { + duplicate_keep_option const keep; + + template + auto build(MapView const& d_map, + KeyHasher const& d_hasher, + KeyEqual const& d_equal, + size_type* const d_output) + { + return reduce_fn{d_map, d_hasher, d_equal, keep, d_output}; + } +}; + +} // namespace + +// This function is split from `distinct.cu` to improve compile time. +rmm::device_uvector reduce_by_row( + hash_map_type const& map, + std::shared_ptr const preprocessed_input, + size_type num_rows, + cudf::nullate::DYNAMIC has_nulls, + bool has_nested_columns, + duplicate_keep_option keep, + null_equality nulls_equal, + nan_equality nans_equal, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + CUDF_EXPECTS(keep != duplicate_keep_option::KEEP_ANY, + "This function should not be called with KEEP_ANY"); + + return hash_reduce_by_row(map, + preprocessed_input, + num_rows, + has_nulls, + has_nested_columns, + nulls_equal, + nans_equal, + reduce_func_builder{keep}, + reduction_init_value(keep), + stream, + mr); +} + +} // namespace cudf::detail diff --git a/cpp/src/stream_compaction/distinct_reduce.cuh b/cpp/src/stream_compaction/distinct_helpers.hpp similarity index 92% rename from cpp/src/stream_compaction/distinct_reduce.cuh rename to cpp/src/stream_compaction/distinct_helpers.hpp index 8ec1fa18205..b667d0b04f0 100644 --- a/cpp/src/stream_compaction/distinct_reduce.cuh +++ b/cpp/src/stream_compaction/distinct_helpers.hpp @@ -14,18 +14,14 @@ * limitations under the License. */ -#include "stream_compaction_common.cuh" +#include "stream_compaction_common.hpp" -#include #include #include #include #include #include -#include - -#include namespace cudf::detail { @@ -56,6 +52,8 @@ auto constexpr reduction_init_value(duplicate_keep_option keep) * - If `keep == KEEP_LAST`: max of row indices in the group. 
* - If `keep == KEEP_NONE`: count of equivalent rows (group size). * + * Note that this function is not needed when `keep == KEEP_NONE`. + * * At the beginning of the operation, the entire output array is filled with a value given by * the `reduction_init_value()` function. Then, the reduction result for each row group is written * into the output array at the index of an unspecified row in the group. @@ -68,11 +66,13 @@ auto constexpr reduction_init_value(duplicate_keep_option keep) * @param has_nested_columns Indicates whether the input table has any nested columns * @param keep The parameter to determine what type of reduction to perform * @param nulls_equal Flag to specify whether null elements should be considered as equal + * @param nans_equal Flag to specify whether NaN values in floating point column should be + * considered equal. * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned vector * @return A device_uvector containing the reduction results */ -rmm::device_uvector hash_reduce_by_row( +rmm::device_uvector reduce_by_row( hash_map_type const& map, std::shared_ptr const preprocessed_input, size_type num_rows, diff --git a/cpp/src/stream_compaction/distinct_reduce.cu b/cpp/src/stream_compaction/distinct_reduce.cu deleted file mode 100644 index 020e6a495bc..00000000000 --- a/cpp/src/stream_compaction/distinct_reduce.cu +++ /dev/null @@ -1,150 +0,0 @@ -/* - * Copyright (c) 2022-2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include "distinct_reduce.cuh" - -#include -#include -#include - -namespace cudf::detail { - -namespace { -/** - * @brief A functor to perform reduce-by-key with keys are rows that compared equal. - * - * TODO: We need to switch to use `static_reduction_map` when it is ready - * (https://github.com/NVIDIA/cuCollections/pull/98). - */ -template -struct reduce_by_row_fn { - MapView const d_map; - KeyHasher const d_hasher; - KeyEqual const d_equal; - duplicate_keep_option const keep; - size_type* const d_output; - - reduce_by_row_fn(MapView const& d_map, - KeyHasher const& d_hasher, - KeyEqual const& d_equal, - duplicate_keep_option const keep, - size_type* const d_output) - : d_map{d_map}, d_hasher{d_hasher}, d_equal{d_equal}, keep{keep}, d_output{d_output} - { - } - - __device__ void operator()(size_type const idx) const - { - auto const out_ptr = get_output_ptr(idx); - - if (keep == duplicate_keep_option::KEEP_FIRST) { - // Store the smallest index of all rows that are equal. - atomicMin(out_ptr, idx); - } else if (keep == duplicate_keep_option::KEEP_LAST) { - // Store the greatest index of all rows that are equal. - atomicMax(out_ptr, idx); - } else { - // Count the number of rows in each group of rows that are compared equal. 
- atomicAdd(out_ptr, size_type{1}); - } - } - - private: - __device__ size_type* get_output_ptr(size_type const idx) const - { - auto const iter = d_map.find(idx, d_hasher, d_equal); - - if (iter != d_map.end()) { - // Only one index value of the duplicate rows could be inserted into the map. - // As such, looking up for all indices of duplicate rows always returns the same value. - auto const inserted_idx = iter->second.load(cuda::std::memory_order_relaxed); - - // All duplicate rows will have concurrent access to this same output slot. - return &d_output[inserted_idx]; - } else { - // All input `idx` values have been inserted into the map before. - // Thus, searching for an `idx` key resulting in the `end()` iterator only happens if - // `d_equal(idx, idx) == false`. - // Such situations are due to comparing nulls or NaNs which are considered as always unequal. - // In those cases, all rows containing nulls or NaNs are distinct. Just return their direct - // output slot. - return &d_output[idx]; - } - } -}; - -} // namespace - -rmm::device_uvector hash_reduce_by_row( - hash_map_type const& map, - std::shared_ptr const preprocessed_input, - size_type num_rows, - cudf::nullate::DYNAMIC has_nulls, - bool has_nested_columns, - duplicate_keep_option keep, - null_equality nulls_equal, - nan_equality nans_equal, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) -{ - CUDF_EXPECTS(keep != duplicate_keep_option::KEEP_ANY, - "This function should not be called with KEEP_ANY"); - - auto reduction_results = rmm::device_uvector(num_rows, stream, mr); - - thrust::uninitialized_fill(rmm::exec_policy(stream), - reduction_results.begin(), - reduction_results.end(), - reduction_init_value(keep)); - - auto const row_hasher = cudf::experimental::row::hash::row_hasher(preprocessed_input); - auto const key_hasher = experimental::compaction_hash(row_hasher.device_hasher(has_nulls)); - - auto const row_comp = cudf::experimental::row::equality::self_comparator(preprocessed_input); - - auto const reduce_by_row = [&](auto const value_comp) { - if (has_nested_columns) { - auto const key_equal = row_comp.equal_to(has_nulls, nulls_equal, value_comp); - thrust::for_each( - rmm::exec_policy(stream), - thrust::make_counting_iterator(0), - thrust::make_counting_iterator(num_rows), - reduce_by_row_fn{ - map.get_device_view(), key_hasher, key_equal, keep, reduction_results.begin()}); - } else { - auto const key_equal = row_comp.equal_to(has_nulls, nulls_equal, value_comp); - thrust::for_each( - rmm::exec_policy(stream), - thrust::make_counting_iterator(0), - thrust::make_counting_iterator(num_rows), - reduce_by_row_fn{ - map.get_device_view(), key_hasher, key_equal, keep, reduction_results.begin()}); - } - }; - - if (nans_equal == nan_equality::ALL_EQUAL) { - using nan_equal_comparator = - cudf::experimental::row::equality::nan_equal_physical_equality_comparator; - reduce_by_row(nan_equal_comparator{}); - } else { - using nan_unequal_comparator = cudf::experimental::row::equality::physical_equality_comparator; - reduce_by_row(nan_unequal_comparator{}); - } - - return reduction_results; -} - -} // namespace cudf::detail diff --git a/cpp/src/stream_compaction/stream_compaction_common.cuh b/cpp/src/stream_compaction/stream_compaction_common.cuh index 4779cd990fd..839672d6a56 100644 --- a/cpp/src/stream_compaction/stream_compaction_common.cuh +++ b/cpp/src/stream_compaction/stream_compaction_common.cuh @@ -29,28 +29,6 @@ namespace cudf { namespace detail { -namespace experimental { - -/** - * @brief Device 
callable to hash a given row. - */ -template -class compaction_hash { - public: - compaction_hash(RowHash row_hasher) : _hash{row_hasher} {} - - __device__ inline auto operator()(size_type i) const noexcept - { - auto hash = _hash(i); - return (hash == COMPACTION_EMPTY_KEY_SENTINEL) ? (hash - 1) : hash; - } - - private: - RowHash _hash; -}; - -} // namespace experimental - /**  * @brief Device functor to determine if a row is valid.  */ diff --git a/cpp/src/stream_compaction/stream_compaction_common.hpp b/cpp/src/stream_compaction/stream_compaction_common.hpp index 0cd2d8f4b14..58d958d2ff4 100644 --- a/cpp/src/stream_compaction/stream_compaction_common.hpp +++ b/cpp/src/stream_compaction/stream_compaction_common.hpp @@ -30,11 +30,6 @@ namespace cudf { namespace detail { -constexpr auto COMPACTION_EMPTY_KEY_SENTINEL = std::numeric_limits::max(); -constexpr auto COMPACTION_EMPTY_VALUE_SENTINEL = std::numeric_limits::min(); - -using hash_type = cuco::murmurhash3_32; - using hash_table_allocator_type = rmm::mr::stream_allocator_adaptor>; using hash_map_type = From 664dfc33a29ddb86e671c19f12e2b56e32d46a8b Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Wed, 13 Sep 2023 14:21:57 -1000 Subject: [PATCH 174/230] Raise NotImplementedError in to_datetime if Z (or tz component) in string (#14074) closes #14039 Avoids this discrepancy when a date string has a tz component ```python In [1]: import pandas In [2]: import cudf In [3]: data = ["2019-01-01T00:00:00.000Z"] In [4]: cudf.to_datetime(data) Out[4]: DatetimeIndex(['2019-01-01'], dtype='datetime64[ns]') In [5]: pandas.to_datetime(data) Out[5]: DatetimeIndex(['2019-01-01 00:00:00+00:00'], dtype='datetime64[ns, UTC]', freq=None) ``` Authors: - Matthew Roeschke (https://github.com/mroeschke) Approvers: - GALI PREM SAGAR (https://github.com/galipremsagar) - Vyas Ramasubramani (https://github.com/vyasr) URL: https://github.com/rapidsai/cudf/pull/14074 --- python/cudf/cudf/core/column/datetime.py | 15 +++++--- python/cudf/cudf/tests/test_datetime.py | 49 +++++++++++------------- python/cudf/cudf/tests/test_string.py | 12 +++--- 3 files changed, 39 insertions(+), 37 deletions(-) diff --git a/python/cudf/cudf/core/column/datetime.py b/python/cudf/cudf/core/column/datetime.py index da6c4fb858c..7775723e267 100644 --- a/python/cudf/cudf/core/column/datetime.py +++ b/python/cudf/cudf/core/column/datetime.py @@ -631,6 +631,10 @@ def infer_format(element: str, **kwargs) -> str: fmt = _guess_datetime_format(element, **kwargs) if fmt is not None: + if "%z" in fmt or "%Z" in fmt: + raise NotImplementedError( + "cuDF does not yet support timezone-aware datetimes" + ) return fmt element_parts = element.split(".") @@ -651,11 +655,12 @@ def infer_format(element: str, **kwargs) -> str: raise ValueError("Unable to infer the timestamp format from the data") if len(second_parts) > 1: - # "Z" indicates Zulu time(widely used in aviation) - Which is - # UTC timezone that currently cudf only supports. Having any other - # unsupported timezone will let the code fail below - # with a ValueError. 
- second_parts.remove("Z") + # We may have a non-digit, timezone-like component + # like Z, UTC-3, +01:00 + if any(re.search(r"\D", part) for part in second_parts): + raise NotImplementedError( + "cuDF does not yet support timezone-aware datetimes" + ) second_part = "".join(second_parts[1:]) if len(second_part) > 1: diff --git a/python/cudf/cudf/tests/test_datetime.py b/python/cudf/cudf/tests/test_datetime.py index 4c20258ae67..5cab19eedc6 100644 --- a/python/cudf/cudf/tests/test_datetime.py +++ b/python/cudf/cudf/tests/test_datetime.py @@ -1250,40 +1250,31 @@ def test_datetime_reductions(data, op, dtype): assert_eq(expected, actual) +@pytest.mark.parametrize("timezone", ["naive", "UTC"]) @pytest.mark.parametrize( "data", [ - np.datetime_as_string( - np.arange("2002-10-27T04:30", 4 * 60, 60, dtype="M8[m]"), - timezone="UTC", - ), - np.datetime_as_string( - np.arange("2002-10-27T04:30", 10 * 60, 1, dtype="M8[m]"), - timezone="UTC", - ), - np.datetime_as_string( - np.arange("2002-10-27T04:30", 10 * 60, 1, dtype="M8[ns]"), - timezone="UTC", - ), - np.datetime_as_string( - np.arange("2002-10-27T04:30", 10 * 60, 1, dtype="M8[us]"), - timezone="UTC", - ), - np.datetime_as_string( - np.arange("2002-10-27T04:30", 4 * 60, 60, dtype="M8[s]"), - timezone="UTC", - ), + np.arange("2002-10-27T04:30", 4 * 60, 60, dtype="M8[m]"), + np.arange("2002-10-27T04:30", 10 * 60, 1, dtype="M8[m]"), + np.arange("2002-10-27T04:30", 10 * 60, 1, dtype="M8[ns]"), + np.arange("2002-10-27T04:30", 10 * 60, 1, dtype="M8[us]"), + np.arange("2002-10-27T04:30", 4 * 60, 60, dtype="M8[s]"), ], ) @pytest.mark.parametrize("dtype", DATETIME_TYPES) -def test_datetime_infer_format(data, dtype): - sr = cudf.Series(data) - psr = pd.Series(data) +def test_datetime_infer_format(data, timezone, dtype): + ts_data = np.datetime_as_string(data, timezone=timezone) + sr = cudf.Series(ts_data) + if timezone == "naive": + psr = pd.Series(ts_data) - expected = psr.astype(dtype) - actual = sr.astype(dtype) + expected = psr.astype(dtype) + actual = sr.astype(dtype) - assert_eq(expected, actual) + assert_eq(expected, actual) + else: + with pytest.raises(NotImplementedError): + sr.astype(dtype) def test_dateoffset_instance_subclass_check(): @@ -2158,6 +2149,12 @@ def test_format_timezone_not_implemented(code): ) +@pytest.mark.parametrize("tz", ["Z", "UTC-3", "+01:00"]) +def test_no_format_timezone_not_implemented(tz): + with pytest.raises(NotImplementedError): + cudf.to_datetime([f"2020-01-01 00:00:00{tz}"]) + + @pytest.mark.parametrize("arg", [True, False]) def test_args_not_datetime_typerror(arg): with pytest.raises(TypeError): diff --git a/python/cudf/cudf/tests/test_string.py b/python/cudf/cudf/tests/test_string.py index 2bddd93ccb8..d54027eb707 100644 --- a/python/cudf/cudf/tests/test_string.py +++ b/python/cudf/cudf/tests/test_string.py @@ -200,12 +200,12 @@ def test_string_astype(dtype): data = ["True", "False", "True", "False", "False"] elif dtype.startswith("datetime64"): data = [ - "2019-06-04T00:00:00Z", - "2019-06-04T12:12:12Z", - "2019-06-03T00:00:00Z", - "2019-05-04T00:00:00Z", - "2018-06-04T00:00:00Z", - "1922-07-21T01:02:03Z", + "2019-06-04T00:00:00", + "2019-06-04T12:12:12", + "2019-06-03T00:00:00", + "2019-05-04T00:00:00", + "2018-06-04T00:00:00", + "1922-07-21T01:02:03", ] elif dtype == "str" or dtype == "object": data = ["ab", "cd", "ef", "gh", "ij"] From 89557bb0efad2d32098ba86b78e4f4706e7fe88f Mon Sep 17 00:00:00 2001 From: GALI PREM SAGAR Date: Wed, 13 Sep 2023 19:22:46 -0500 Subject: [PATCH 175/230] Allow `numeric_only=True` 
for reduction operations on numeric types (#14111) Fixes: #14090 This PR allows passing `numeric_only=True` for reduction operation on numerical columns. Authors: - GALI PREM SAGAR (https://github.com/galipremsagar) Approvers: - Matthew Roeschke (https://github.com/mroeschke) URL: https://github.com/rapidsai/cudf/pull/14111 --- python/cudf/cudf/core/single_column_frame.py | 6 ++- python/cudf/cudf/tests/test_stats.py | 44 ++++++++++---------- 2 files changed, 26 insertions(+), 24 deletions(-) diff --git a/python/cudf/cudf/core/single_column_frame.py b/python/cudf/cudf/core/single_column_frame.py index 7c019f0722c..6a56ab8f3a5 100644 --- a/python/cudf/cudf/core/single_column_frame.py +++ b/python/cudf/cudf/core/single_column_frame.py @@ -49,9 +49,11 @@ def _reduce( if level is not None: raise NotImplementedError("level parameter is not implemented yet") - if numeric_only: + if numeric_only and not isinstance( + self._column, cudf.core.column.numerical_base.NumericalBaseColumn + ): raise NotImplementedError( - f"Series.{op} does not implement numeric_only" + f"Series.{op} does not implement numeric_only." ) try: return getattr(self._column, op)(**kwargs) diff --git a/python/cudf/cudf/tests/test_stats.py b/python/cudf/cudf/tests/test_stats.py index 6478fbaad95..463cdb8a7f4 100644 --- a/python/cudf/cudf/tests/test_stats.py +++ b/python/cudf/cudf/tests/test_stats.py @@ -247,30 +247,37 @@ def test_misc_quantiles(data, q): ], ) @pytest.mark.parametrize("null_flag", [False, True]) -def test_kurtosis_series(data, null_flag): +@pytest.mark.parametrize("numeric_only", [False, True]) +def test_kurtosis_series(data, null_flag, numeric_only): pdata = data.to_pandas() if null_flag and len(data) > 2: data.iloc[[0, 2]] = None pdata.iloc[[0, 2]] = None - got = data.kurtosis() + got = data.kurtosis(numeric_only=numeric_only) got = got if np.isscalar(got) else got.to_numpy() - expected = pdata.kurtosis() + expected = pdata.kurtosis(numeric_only=numeric_only) np.testing.assert_array_almost_equal(got, expected) - got = data.kurt() + got = data.kurt(numeric_only=numeric_only) got = got if np.isscalar(got) else got.to_numpy() - expected = pdata.kurt() + expected = pdata.kurt(numeric_only=numeric_only) np.testing.assert_array_almost_equal(got, expected) - got = data.kurt(numeric_only=False) - got = got if np.isscalar(got) else got.to_numpy() - expected = pdata.kurt(numeric_only=False) - np.testing.assert_array_almost_equal(got, expected) - with pytest.raises(NotImplementedError): - data.kurt(numeric_only=True) +@pytest.mark.parametrize("op", ["skew", "kurt"]) +def test_kurt_skew_error(op): + gs = cudf.Series(["ab", "cd"]) + ps = gs.to_pandas() + + with pytest.raises(FutureWarning): + assert_exceptions_equal( + getattr(gs, op), + getattr(ps, op), + lfunc_args_and_kwargs=([], {"numeric_only": True}), + rfunc_args_and_kwargs=([], {"numeric_only": True}), + ) @pytest.mark.parametrize( @@ -290,26 +297,19 @@ def test_kurtosis_series(data, null_flag): ], ) @pytest.mark.parametrize("null_flag", [False, True]) -def test_skew_series(data, null_flag): +@pytest.mark.parametrize("numeric_only", [False, True]) +def test_skew_series(data, null_flag, numeric_only): pdata = data.to_pandas() if null_flag and len(data) > 2: data.iloc[[0, 2]] = None pdata.iloc[[0, 2]] = None - got = data.skew() - expected = pdata.skew() + got = data.skew(numeric_only=numeric_only) + expected = pdata.skew(numeric_only=numeric_only) got = got if np.isscalar(got) else got.to_numpy() np.testing.assert_array_almost_equal(got, expected) - got = 
data.skew(numeric_only=False) - expected = pdata.skew(numeric_only=False) - got = got if np.isscalar(got) else got.to_numpy() - np.testing.assert_array_almost_equal(got, expected) - - with pytest.raises(NotImplementedError): - data.skew(numeric_only=True) - @pytest.mark.parametrize("dtype", params_dtypes) @pytest.mark.parametrize("num_na", [0, 1, 50, 99, 100]) From 1bfeee7575e137bc75741cb2caf015e55ecab2cd Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Wed, 13 Sep 2023 14:23:14 -1000 Subject: [PATCH 176/230] Raise NotImplementedError for datetime strings with UTC offset (#14070) Avoids e.g. DatetimeIndex(["2022-07-22 00:00:00+02:00"]) from dropping the +02:00 since timezones are not supported Authors: - Matthew Roeschke (https://github.com/mroeschke) Approvers: - Lawrence Mitchell (https://github.com/wence-) - Vyas Ramasubramani (https://github.com/vyasr) URL: https://github.com/rapidsai/cudf/pull/14070 --- python/cudf/cudf/core/column/column.py | 18 ++++++++++++++++-- python/cudf/cudf/tests/test_datetime.py | 6 ++++++ 2 files changed, 22 insertions(+), 2 deletions(-) diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index 59ab3569814..d2e2f11a12e 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -2519,11 +2519,11 @@ def _construct_array( arbitrary = cupy.asarray(arbitrary, dtype=dtype) except (TypeError, ValueError): native_dtype = dtype - inferred_dtype = None + inferred_dtype = infer_dtype(arbitrary, skipna=False) if ( dtype is None and not cudf._lib.scalar._is_null_host_scalar(arbitrary) - and (inferred_dtype := infer_dtype(arbitrary, skipna=False)) + and inferred_dtype in ( "mixed", "mixed-integer", @@ -2533,6 +2533,20 @@ def _construct_array( if inferred_dtype == "interval": # Only way to construct an Interval column. 
return pd.array(arbitrary) + elif ( + inferred_dtype == "string" and getattr(dtype, "kind", None) == "M" + ): + # We may have date-like strings with timezones + try: + pd_arbitrary = pd.to_datetime(arbitrary) + if isinstance(pd_arbitrary.dtype, pd.DatetimeTZDtype): + raise NotImplementedError( + "cuDF does not yet support timezone-aware datetimes" + ) + except pd.errors.OutOfBoundsDatetime: + # https://github.com/pandas-dev/pandas/issues/55096 + pass + arbitrary = np.asarray( arbitrary, dtype=native_dtype diff --git a/python/cudf/cudf/tests/test_datetime.py b/python/cudf/cudf/tests/test_datetime.py index 5cab19eedc6..0cc7112454c 100644 --- a/python/cudf/cudf/tests/test_datetime.py +++ b/python/cudf/cudf/tests/test_datetime.py @@ -2141,6 +2141,12 @@ def test_daterange_pandas_compatibility(): assert_eq(expected, actual) +def test_strings_with_utc_offset_not_implemented(): + with pytest.warns(DeprecationWarning, match="parsing timezone"): # cupy + with pytest.raises(NotImplementedError): + DatetimeIndex(["2022-07-22 00:00:00+02:00"]) + + @pytest.mark.parametrize("code", ["z", "Z"]) def test_format_timezone_not_implemented(code): with pytest.raises(NotImplementedError): From 3b691f4be744ff1155df3634cd334211e738e37d Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Fri, 15 Sep 2023 10:03:52 -1000 Subject: [PATCH 177/230] Raise NotImplementedError in to_datetime with dayfirst without infer_format (#14058) Raises a `NotImplementedError` to avoid this incorrect behavior (which seems to actually not be implemented) ```python In [6]: cudf.to_datetime(["10-02-2014"], dayfirst=True) Out[6]: DatetimeIndex(['2014-10-02'], dtype='datetime64[ns]') ``` closes https://github.com/rapidsai/cudf/issues/14042 Authors: - Matthew Roeschke (https://github.com/mroeschke) Approvers: - Vyas Ramasubramani (https://github.com/vyasr) URL: https://github.com/rapidsai/cudf/pull/14058 --- python/cudf/cudf/core/tools/datetimes.py | 11 +++---- python/cudf/cudf/tests/test_datetime.py | 38 +++++++++++++++++++----- 2 files changed, 36 insertions(+), 13 deletions(-) diff --git a/python/cudf/cudf/core/tools/datetimes.py b/python/cudf/cudf/core/tools/datetimes.py index f736e055163..a3f4bacf206 100644 --- a/python/cudf/cudf/core/tools/datetimes.py +++ b/python/cudf/cudf/core/tools/datetimes.py @@ -353,15 +353,16 @@ def _process_col(col, unit, dayfirst, infer_datetime_format, format): format=format, ) else: - if infer_datetime_format and format is None: + if format is None: + if not infer_datetime_format and dayfirst: + raise NotImplementedError( + f"{dayfirst=} not implemented " + f"when {format=} and {infer_datetime_format=}." 
+ ) format = column.datetime.infer_format( element=col.element_indexing(0), dayfirst=dayfirst, ) - elif format is None: - format = column.datetime.infer_format( - element=col.element_indexing(0) - ) return col.as_datetime_column( dtype=_unit_dtype_map[unit], format=format, diff --git a/python/cudf/cudf/tests/test_datetime.py b/python/cudf/cudf/tests/test_datetime.py index 0cc7112454c..164856ed6f5 100644 --- a/python/cudf/cudf/tests/test_datetime.py +++ b/python/cudf/cudf/tests/test_datetime.py @@ -617,22 +617,44 @@ def test_datetime_dataframe(): @pytest.mark.parametrize("infer_datetime_format", [True, False]) def test_cudf_to_datetime(data, dayfirst, infer_datetime_format): pd_data = data + is_string_data = False if isinstance(pd_data, (pd.Series, pd.DataFrame, pd.Index)): gd_data = cudf.from_pandas(pd_data) + is_string_data = ( + gd_data.ndim == 1 + and not gd_data.empty + and gd_data.dtype.kind == "O" + ) else: if type(pd_data).__module__ == np.__name__: gd_data = cp.array(pd_data) else: gd_data = pd_data + is_string_data = isinstance(gd_data, list) and isinstance( + next(iter(gd_data), None), str + ) - expected = pd.to_datetime( - pd_data, dayfirst=dayfirst, infer_datetime_format=infer_datetime_format - ) - actual = cudf.to_datetime( - gd_data, dayfirst=dayfirst, infer_datetime_format=infer_datetime_format - ) - - assert_eq(actual, expected) + if dayfirst and not infer_datetime_format and is_string_data: + # Note: pandas<2.0 also does not respect dayfirst=True correctly + # for object data + with pytest.raises(NotImplementedError): + cudf.to_datetime( + gd_data, + dayfirst=dayfirst, + infer_datetime_format=infer_datetime_format, + ) + else: + expected = pd.to_datetime( + pd_data, + dayfirst=dayfirst, + infer_datetime_format=infer_datetime_format, + ) + actual = cudf.to_datetime( + gd_data, + dayfirst=dayfirst, + infer_datetime_format=infer_datetime_format, + ) + assert_eq(actual, expected) @pytest.mark.parametrize( From 4ca568e764a3898bf619a221cdb91a9261df22bf Mon Sep 17 00:00:00 2001 From: "Richard (Rick) Zamora" Date: Mon, 18 Sep 2023 09:00:39 -0500 Subject: [PATCH 178/230] Update pyarrow-related dispatch logic in dask_cudf (#14069) Updates `dask_cudf` dispatch logic to avoid breakage from https://github.com/dask/dask/pull/10500. Also removes stale `try`/`except` logic. 
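A minimal sketch of how the updated dispatch functions are exercised (mirroring the new test below; assumes `dask_cudf` has been imported so the cudf implementations are registered):

```python
import cudf
import dask_cudf  # noqa: F401 -- importing registers the cudf dispatch implementations
from dask.dataframe.dispatch import (
    from_pyarrow_table_dispatch,
    pyarrow_schema_dispatch,
    to_pyarrow_table_dispatch,
)

df = cudf.DataFrame({"a": [1, 2, 3], "b": ["x", "y", "z"]})

# both dispatches now accept preserve_index directly
table = to_pyarrow_table_dispatch(df, preserve_index=True)
schema = pyarrow_schema_dispatch(df, preserve_index=True)
assert schema.equals(table.schema)

# back to cudf; pyarrow-only options such as self_destruct are ignored
roundtrip = from_pyarrow_table_dispatch(df, table)
```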
Authors: - Richard (Rick) Zamora (https://github.com/rjzamora) - Ray Douglass (https://github.com/raydouglass) - gpuCI (https://github.com/GPUtester) - Mike Wendt (https://github.com/mike-wendt) - AJ Schmidt (https://github.com/ajschmidt8) - GALI PREM SAGAR (https://github.com/galipremsagar) Approvers: - Lawrence Mitchell (https://github.com/wence-) URL: https://github.com/rapidsai/cudf/pull/14069 --- python/dask_cudf/dask_cudf/backends.py | 69 +++++++++---------- .../dask_cudf/tests/test_dispatch.py | 21 ++++-- 2 files changed, 47 insertions(+), 43 deletions(-) diff --git a/python/dask_cudf/dask_cudf/backends.py b/python/dask_cudf/dask_cudf/backends.py index 2470b4d50f1..e3f4f04eb85 100644 --- a/python/dask_cudf/dask_cudf/backends.py +++ b/python/dask_cudf/dask_cudf/backends.py @@ -20,11 +20,14 @@ from dask.dataframe.dispatch import ( categorical_dtype_dispatch, concat_dispatch, + from_pyarrow_table_dispatch, group_split_dispatch, grouper_dispatch, hash_object_dispatch, is_categorical_dtype_dispatch, make_meta_dispatch, + pyarrow_schema_dispatch, + to_pyarrow_table_dispatch, tolist_dispatch, union_categoricals_dispatch, ) @@ -317,16 +320,6 @@ def get_grouper_cudf(obj): return cudf.core.groupby.Grouper -try: - from dask.dataframe.dispatch import pyarrow_schema_dispatch - - @pyarrow_schema_dispatch.register((cudf.DataFrame,)) - def get_pyarrow_schema_cudf(obj): - return obj.to_arrow().schema - -except ImportError: - pass - try: try: from dask.array.dispatch import percentile_lookup @@ -378,35 +371,37 @@ def percentile_cudf(a, q, interpolation="linear"): except ImportError: pass -try: - # Requires dask>2023.6.0 - from dask.dataframe.dispatch import ( - from_pyarrow_table_dispatch, - to_pyarrow_table_dispatch, - ) - @to_pyarrow_table_dispatch.register(cudf.DataFrame) - def _cudf_to_table(obj, preserve_index=True, **kwargs): - if kwargs: - warnings.warn( - "Ignoring the following arguments to " - f"`to_pyarrow_table_dispatch`: {list(kwargs)}" - ) - return obj.to_arrow(preserve_index=preserve_index) - - @from_pyarrow_table_dispatch.register(cudf.DataFrame) - def _table_to_cudf(obj, table, self_destruct=None, **kwargs): - # cudf ignores self_destruct. - kwargs.pop("self_destruct", None) - if kwargs: - warnings.warn( - f"Ignoring the following arguments to " - f"`from_pyarrow_table_dispatch`: {list(kwargs)}" - ) - return obj.from_arrow(table) +@pyarrow_schema_dispatch.register((cudf.DataFrame,)) +def _get_pyarrow_schema_cudf(obj, preserve_index=True, **kwargs): + if kwargs: + warnings.warn( + "Ignoring the following arguments to " + f"`pyarrow_schema_dispatch`: {list(kwargs)}" + ) + return meta_nonempty(obj).to_arrow(preserve_index=preserve_index).schema -except ImportError: - pass + +@to_pyarrow_table_dispatch.register(cudf.DataFrame) +def _cudf_to_table(obj, preserve_index=True, **kwargs): + if kwargs: + warnings.warn( + "Ignoring the following arguments to " + f"`to_pyarrow_table_dispatch`: {list(kwargs)}" + ) + return obj.to_arrow(preserve_index=preserve_index) + + +@from_pyarrow_table_dispatch.register(cudf.DataFrame) +def _table_to_cudf(obj, table, self_destruct=None, **kwargs): + # cudf ignores self_destruct. 
+ kwargs.pop("self_destruct", None) + if kwargs: + warnings.warn( + f"Ignoring the following arguments to " + f"`from_pyarrow_table_dispatch`: {list(kwargs)}" + ) + return obj.from_arrow(table) @union_categoricals_dispatch.register((cudf.Series, cudf.BaseIndex)) diff --git a/python/dask_cudf/dask_cudf/tests/test_dispatch.py b/python/dask_cudf/dask_cudf/tests/test_dispatch.py index 22cc0f161e2..cf49b1df4f4 100644 --- a/python/dask_cudf/dask_cudf/tests/test_dispatch.py +++ b/python/dask_cudf/dask_cudf/tests/test_dispatch.py @@ -3,9 +3,7 @@ import numpy as np import pandas as pd import pytest -from packaging import version -import dask from dask.base import tokenize from dask.dataframe import assert_eq from dask.dataframe.methods import is_categorical_dtype @@ -24,10 +22,6 @@ def test_is_categorical_dispatch(): assert is_categorical_dtype(cudf.Index([1, 2, 3], dtype="category")) -@pytest.mark.skipif( - version.parse(dask.__version__) <= version.parse("2023.6.0"), - reason="Pyarrow-conversion dispatch requires dask>2023.6.0", -) def test_pyarrow_conversion_dispatch(): from dask.dataframe.dispatch import ( from_pyarrow_table_dispatch, @@ -79,3 +73,18 @@ def test_deterministic_tokenize(index): df2 = df.set_index(["B", "C"], drop=False) assert tokenize(df) != tokenize(df2) assert tokenize(df2) == tokenize(df2) + + +@pytest.mark.parametrize("preserve_index", [True, False]) +def test_pyarrow_schema_dispatch(preserve_index): + from dask.dataframe.dispatch import ( + pyarrow_schema_dispatch, + to_pyarrow_table_dispatch, + ) + + df = cudf.DataFrame(np.random.randn(10, 3), columns=list("abc")) + df["d"] = cudf.Series(["cat", "dog"] * 5) + table = to_pyarrow_table_dispatch(df, preserve_index=preserve_index) + schema = pyarrow_schema_dispatch(df, preserve_index=preserve_index) + + assert schema.equals(table.schema) From 5935ef3ce26b1eb7136dcaa989a36b15071a9d0d Mon Sep 17 00:00:00 2001 From: GALI PREM SAGAR Date: Mon, 18 Sep 2023 09:53:18 -0500 Subject: [PATCH 179/230] Drop `kwargs` from `Series.count` (#14106) Fixes: #14089 This PR drops `kwargs` from `Series.count` method signature. 
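For illustration, the user-facing effect (mirroring the new test below):

```python
import cudf

s = cudf.Series([1, None, 3])
s.count()             # 2 -- nulls are never counted
s.count(skipna=True)  # TypeError; previously swallowed by the dropped **kwargs
```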
Authors: - GALI PREM SAGAR (https://github.com/galipremsagar) Approvers: - Matthew Roeschke (https://github.com/mroeschke) - Benjamin Zaitlen (https://github.com/quasiben) URL: https://github.com/rapidsai/cudf/pull/14106 --- python/cudf/cudf/core/series.py | 2 +- python/cudf/cudf/tests/test_series.py | 6 ++++++ python/dask_cudf/dask_cudf/core.py | 2 +- 3 files changed, 8 insertions(+), 2 deletions(-) diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index f44a3123dd3..7692d3015f8 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -2549,7 +2549,7 @@ def apply(self, func, convert_dtype=True, args=(), **kwargs): # Stats # @_cudf_nvtx_annotate - def count(self, level=None, **kwargs): + def count(self, level=None): """ Return number of non-NA/null observations in the Series diff --git a/python/cudf/cudf/tests/test_series.py b/python/cudf/cudf/tests/test_series.py index 798809b0ada..b1e991106ee 100644 --- a/python/cudf/cudf/tests/test_series.py +++ b/python/cudf/cudf/tests/test_series.py @@ -2311,3 +2311,9 @@ def test_series_round_builtin(data, digits): actual = round(gs, digits) assert_eq(expected, actual) + + +def test_series_count_invalid_param(): + s = cudf.Series([]) + with pytest.raises(TypeError): + s.count(skipna=True) diff --git a/python/dask_cudf/dask_cudf/core.py b/python/dask_cudf/dask_cudf/core.py index d2858876fcd..5b37e6e825c 100644 --- a/python/dask_cudf/dask_cudf/core.py +++ b/python/dask_cudf/dask_cudf/core.py @@ -421,7 +421,7 @@ def _naive_var(ddf, meta, skipna, ddof, split_every, out): def _parallel_var(ddf, meta, skipna, split_every, out): def _local_var(x, skipna): if skipna: - n = x.count(skipna=skipna) + n = x.count() avg = x.mean(skipna=skipna) else: # Not skipping nulls, so might as well From 8e081c015417c5a8d2a99f9db6bbc9a2c438e477 Mon Sep 17 00:00:00 2001 From: GALI PREM SAGAR Date: Mon, 18 Sep 2023 12:51:08 -0500 Subject: [PATCH 180/230] Add support for nested dict in `DataFrame` constructor (#14119) Fixes: #14096 This PR enables nested dict initialization support in `DataFrame` constructor. 
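For illustration, the construction this enables (mirroring the new test below; outer keys become columns and inner keys the index, matching pandas):

```python
import cudf

data = {
    "one": {"col_a": "foo1", "col_b": "bar1"},
    "two": {"col_a": "foo2", "col_b": "bar2"},
}
gdf = cudf.DataFrame(data)
# columns: "one", "two"; index: "col_a", "col_b" -- same layout as pd.DataFrame(data)
```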
Authors: - GALI PREM SAGAR (https://github.com/galipremsagar) Approvers: - Matthew Roeschke (https://github.com/mroeschke) URL: https://github.com/rapidsai/cudf/pull/14119 --- python/cudf/cudf/core/dataframe.py | 4 ++-- python/cudf/cudf/tests/test_dataframe.py | 19 +++++++++++++++++++ 2 files changed, 21 insertions(+), 2 deletions(-) diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index 5a3d25a08a7..4fc175512a0 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -977,7 +977,7 @@ def _align_input_series_indices(data, index): input_series = [ Series(val) for val in data.values() - if isinstance(val, (pd.Series, Series)) + if isinstance(val, (pd.Series, Series, dict)) ] if input_series: @@ -994,7 +994,7 @@ def _align_input_series_indices(data, index): index = aligned_input_series[0].index for name, val in data.items(): - if isinstance(val, (pd.Series, Series)): + if isinstance(val, (pd.Series, Series, dict)): data[name] = aligned_input_series.pop(0) return data, index diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py index 61372bab3ad..652bdbbee45 100644 --- a/python/cudf/cudf/tests/test_dataframe.py +++ b/python/cudf/cudf/tests/test_dataframe.py @@ -10349,3 +10349,22 @@ def test_dataframe_round_builtin(digits): actual = round(gdf, digits) assert_eq(expected, actual) + + +def test_dataframe_init_from_nested_dict(): + ordered_dict = OrderedDict( + [ + ("one", OrderedDict([("col_a", "foo1"), ("col_b", "bar1")])), + ("two", OrderedDict([("col_a", "foo2"), ("col_b", "bar2")])), + ("three", OrderedDict([("col_a", "foo3"), ("col_b", "bar3")])), + ] + ) + pdf = pd.DataFrame(ordered_dict) + gdf = cudf.DataFrame(ordered_dict) + + assert_eq(pdf, gdf) + regular_dict = {key: dict(value) for key, value in ordered_dict.items()} + + pdf = pd.DataFrame(regular_dict) + gdf = cudf.DataFrame(regular_dict) + assert_eq(pdf, gdf) From 4467066c952111c0131383784d3eb6bf3248f0ac Mon Sep 17 00:00:00 2001 From: GALI PREM SAGAR Date: Mon, 18 Sep 2023 12:51:53 -0500 Subject: [PATCH 181/230] Restrict iterables of `DataFrame`'s as input to `DataFrame` constructor (#14118) Fixes: #14094 This PR raises an error when an iterates of `DataFrame`'s is detected in `DataFrame` constructor. 
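A short sketch of the new behaviour (mirroring the new test below):

```python
import cudf

df = cudf.DataFrame(range(2))

# a list of DataFrames is not valid 2-d input and is now rejected up front
cudf.DataFrame([df])  # raises ValueError: Must pass 2-d input.
```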
Authors: - GALI PREM SAGAR (https://github.com/galipremsagar) Approvers: - Matthew Roeschke (https://github.com/mroeschke) URL: https://github.com/rapidsai/cudf/pull/14118 --- python/cudf/cudf/core/dataframe.py | 11 ++++++----- python/cudf/cudf/tests/test_dataframe.py | 6 ++++++ 2 files changed, 12 insertions(+), 5 deletions(-) diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index 4fc175512a0..84c16b71997 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -852,12 +852,13 @@ def _init_from_list_like(self, data, index=None, columns=None): elif len(data) > 0 and isinstance(data[0], pd._libs.interval.Interval): data = DataFrame.from_pandas(pd.DataFrame(data)) self._data = data._data + elif any( + not isinstance(col, (abc.Iterable, abc.Sequence)) for col in data + ): + raise TypeError("Inputs should be an iterable or sequence.") + elif len(data) > 0 and not can_convert_to_column(data[0]): + raise ValueError("Must pass 2-d input.") else: - if any( - not isinstance(col, (abc.Iterable, abc.Sequence)) - for col in data - ): - raise TypeError("Inputs should be an iterable or sequence.") if ( len(data) > 0 and columns is None diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py index 652bdbbee45..cbef9bfa2d8 100644 --- a/python/cudf/cudf/tests/test_dataframe.py +++ b/python/cudf/cudf/tests/test_dataframe.py @@ -10260,6 +10260,12 @@ def __getitem__(self, key): cudf.DataFrame({"a": A()}) +def test_dataframe_constructor_dataframe_list(): + df = cudf.DataFrame(range(2)) + with pytest.raises(ValueError): + cudf.DataFrame([df]) + + def test_dataframe_constructor_from_namedtuple(): Point1 = namedtuple("Point1", ["a", "b", "c"]) Point2 = namedtuple("Point1", ["x", "y"]) From 2acd3dfa9e859feb4d803d9446c89b80f10bd54a Mon Sep 17 00:00:00 2001 From: Vukasin Milovanovic Date: Mon, 18 Sep 2023 14:10:14 -0700 Subject: [PATCH 182/230] Expand statistics support in ORC writer (#13848) Closes #7087, closes #13793, closes #13899 This PR adds support for several cases and statistics types: - sum statistics are included even when all elements are null (no minmax); - sum statistics are included in double stats; - minimum/maximum and minimumNanos/maximumNanos are included in timestamp stats; - hasNull field is written for all columns. - decimal statistics Added tests for all supported stats. 
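For reference, a sketch of how the expanded statistics can be inspected from Python (assumes the existing `cudf.io.orc.read_orc_statistics` helper, the source of `file_stats`/`stripes_stats` in the tests below; key names as exercised in those tests):

```python
import cudf

df = cudf.DataFrame({"a": [1.5, 2.5, None, 4.5]})
df.to_orc("stats.orc")

file_stats, stripe_stats = cudf.io.orc.read_orc_statistics(["stats.orc"])
col = file_stats[0]["a"]
# double columns now also carry a file-level sum alongside min/max and the value count
print(col["number_of_values"], col["minimum"], col["maximum"])
```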
Authors: - Vukasin Milovanovic (https://github.com/vuule) - Karthikeyan (https://github.com/karthikeyann) Approvers: - Lawrence Mitchell (https://github.com/wence-) - Robert (Bobby) Evans (https://github.com/revans2) - Vyas Ramasubramani (https://github.com/vyasr) - Karthikeyan (https://github.com/karthikeyann) URL: https://github.com/rapidsai/cudf/pull/13848 --- cpp/include/cudf/io/orc_metadata.hpp | 10 +- .../detail/convert/fixed_point_to_string.cuh | 80 +++++++++ cpp/src/io/orc/orc.cpp | 4 +- cpp/src/io/orc/stats_enc.cu | 169 +++++++++++++----- cpp/src/io/parquet/page_enc.cu | 4 +- .../statistics_type_identification.cuh | 19 +- .../io/statistics/typed_statistics_chunk.cuh | 2 +- .../strings/convert/convert_fixed_point.cu | 54 +----- cpp/tests/io/orc_test.cpp | 109 +++++++++-- python/cudf/cudf/tests/test_orc.py | 60 ++++--- 10 files changed, 356 insertions(+), 155 deletions(-) create mode 100644 cpp/include/cudf/strings/detail/convert/fixed_point_to_string.cuh diff --git a/cpp/include/cudf/io/orc_metadata.hpp b/cpp/include/cudf/io/orc_metadata.hpp index 623ee2e49fc..82d59803c25 100644 --- a/cpp/include/cudf/io/orc_metadata.hpp +++ b/cpp/include/cudf/io/orc_metadata.hpp @@ -111,10 +111,10 @@ struct string_statistics : minmax_statistics, sum_statistics count; ///< Count of `false` and `true` values + std::vector count; ///< count of `true` values }; /** @@ -141,8 +141,10 @@ using binary_statistics = sum_statistics; * the UNIX epoch. The `minimum_utc` and `maximum_utc` are the same values adjusted to UTC. */ struct timestamp_statistics : minmax_statistics { - std::optional minimum_utc; ///< minimum in milliseconds - std::optional maximum_utc; ///< maximum in milliseconds + std::optional minimum_utc; ///< minimum in milliseconds + std::optional maximum_utc; ///< maximum in milliseconds + std::optional minimum_nanos; ///< nanoseconds part of the minimum + std::optional maximum_nanos; ///< nanoseconds part of the maximum }; namespace orc { diff --git a/cpp/include/cudf/strings/detail/convert/fixed_point_to_string.cuh b/cpp/include/cudf/strings/detail/convert/fixed_point_to_string.cuh new file mode 100644 index 00000000000..0ee26ec9ee2 --- /dev/null +++ b/cpp/include/cudf/strings/detail/convert/fixed_point_to_string.cuh @@ -0,0 +1,80 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include + +namespace cudf::strings::detail { + +/** + * @brief Returns the number of digits in the given fixed point number. 
+ * + * @param value The value of the fixed point number + * @param scale The scale of the fixed point number + * @return int32_t The number of digits required to represent the fixed point number + */ +__device__ inline int32_t fixed_point_string_size(__int128_t const& value, int32_t scale) +{ + if (scale >= 0) return count_digits(value) + scale; + + auto const abs_value = numeric::detail::abs(value); + auto const exp_ten = numeric::detail::exp10<__int128_t>(-scale); + auto const fraction = count_digits(abs_value % exp_ten); + auto const num_zeros = std::max(0, (-scale - fraction)); + return static_cast(value < 0) + // sign if negative + count_digits(abs_value / exp_ten) + // integer + 1 + // decimal point + num_zeros + // zeros padding + fraction; // size of fraction +} + +/** + * @brief Converts the given fixed point number to a string. + * + * Caller is responsible for ensuring that the output buffer is large enough. The required output + * buffer size can be obtained by calling `fixed_point_string_size`. + * + * @param value The value of the fixed point number + * @param scale The scale of the fixed point number + * @param out_ptr The pointer to the output string + */ +__device__ inline void fixed_point_to_string(__int128_t const& value, int32_t scale, char* out_ptr) +{ + if (scale >= 0) { + out_ptr += integer_to_string(value, out_ptr); + thrust::generate_n(thrust::seq, out_ptr, scale, []() { return '0'; }); // add zeros + return; + } + + // scale < 0 + // write format: [-]integer.fraction + // where integer = abs(value) / (10^abs(scale)) + // fraction = abs(value) % (10^abs(scale)) + if (value < 0) *out_ptr++ = '-'; // add sign + auto const abs_value = numeric::detail::abs(value); + auto const exp_ten = numeric::detail::exp10<__int128_t>(-scale); + auto const num_zeros = std::max(0, (-scale - count_digits(abs_value % exp_ten))); + + out_ptr += integer_to_string(abs_value / exp_ten, out_ptr); // add the integer part + *out_ptr++ = '.'; // add decimal point + + thrust::generate_n(thrust::seq, out_ptr, num_zeros, []() { return '0'; }); // add zeros + out_ptr += num_zeros; + + integer_to_string(abs_value % exp_ten, out_ptr); // add the fraction part +} + +} // namespace cudf::strings::detail diff --git a/cpp/src/io/orc/orc.cpp b/cpp/src/io/orc/orc.cpp index fc50b7118be..bc399b75ef9 100644 --- a/cpp/src/io/orc/orc.cpp +++ b/cpp/src/io/orc/orc.cpp @@ -178,7 +178,9 @@ void ProtobufReader::read(timestamp_statistics& s, size_t maxlen) auto op = std::tuple(field_reader(1, s.minimum), field_reader(2, s.maximum), field_reader(3, s.minimum_utc), - field_reader(4, s.maximum_utc)); + field_reader(4, s.maximum_utc), + field_reader(5, s.minimum_nanos), + field_reader(6, s.maximum_nanos)); function_builder(s, maxlen, op); } diff --git a/cpp/src/io/orc/stats_enc.cu b/cpp/src/io/orc/stats_enc.cu index 069841980c1..69d7ec95acd 100644 --- a/cpp/src/io/orc/stats_enc.cu +++ b/cpp/src/io/orc/stats_enc.cu @@ -16,15 +16,16 @@ #include "orc_gpu.hpp" -#include #include +#include +#include + #include -namespace cudf { -namespace io { -namespace orc { -namespace gpu { +namespace cudf::io::orc::gpu { + +using strings::detail::fixed_point_string_size; constexpr unsigned int init_threads_per_group = 32; constexpr unsigned int init_groups_per_block = 4; @@ -58,13 +59,14 @@ __global__ void __launch_bounds__(init_threads_per_block) constexpr unsigned int buffersize_reduction_dim = 32; constexpr unsigned int block_size = buffersize_reduction_dim * buffersize_reduction_dim; constexpr unsigned int pb_fld_hdrlen = 1; 
-constexpr unsigned int pb_fld_hdrlen16 = 2; // > 127-byte length -constexpr unsigned int pb_fld_hdrlen32 = 5; // > 16KB length +constexpr unsigned int pb_fld_hdrlen32 = 5; +constexpr unsigned int pb_fldlen_int32 = 5; constexpr unsigned int pb_fldlen_int64 = 10; constexpr unsigned int pb_fldlen_float64 = 8; -constexpr unsigned int pb_fldlen_decimal = 40; // Assume decimal2string fits in 40 characters constexpr unsigned int pb_fldlen_bucket1 = 1 + pb_fldlen_int64; -constexpr unsigned int pb_fldlen_common = 2 * pb_fld_hdrlen + pb_fldlen_int64; +// statistics field number + number of values + has null +constexpr unsigned int pb_fldlen_common = + pb_fld_hdrlen + (pb_fld_hdrlen + pb_fldlen_int64) + 2 * pb_fld_hdrlen; template __global__ void __launch_bounds__(block_size, 1) @@ -87,21 +89,32 @@ __global__ void __launch_bounds__(block_size, 1) case dtype_int8: case dtype_int16: case dtype_int32: - case dtype_date32: case dtype_int64: - case dtype_timestamp64: stats_len = pb_fldlen_common + pb_fld_hdrlen + 3 * (pb_fld_hdrlen + pb_fldlen_int64); break; + case dtype_date32: + stats_len = pb_fldlen_common + pb_fld_hdrlen + 2 * (pb_fld_hdrlen + pb_fldlen_int64); + break; + case dtype_timestamp64: + stats_len = pb_fldlen_common + pb_fld_hdrlen + 4 * (pb_fld_hdrlen + pb_fldlen_int64) + + 2 * (pb_fld_hdrlen + pb_fldlen_int32); + break; case dtype_float32: case dtype_float64: stats_len = pb_fldlen_common + pb_fld_hdrlen + 3 * (pb_fld_hdrlen + pb_fldlen_float64); break; case dtype_decimal64: - case dtype_decimal128: - stats_len = pb_fldlen_common + pb_fld_hdrlen16 + 3 * (pb_fld_hdrlen + pb_fldlen_decimal); - break; + case dtype_decimal128: { + auto const scale = groups[idx].col_dtype.scale(); + auto const min_size = fixed_point_string_size(chunks[idx].min_value.d128_val, scale); + auto const max_size = fixed_point_string_size(chunks[idx].max_value.d128_val, scale); + auto const sum_size = fixed_point_string_size(chunks[idx].sum.d128_val, scale); + // common + total field length + encoded string lengths + strings + stats_len = pb_fldlen_common + pb_fld_hdrlen32 + 3 * (pb_fld_hdrlen + pb_fld_hdrlen32) + + min_size + max_size + sum_size; + } break; case dtype_string: - stats_len = pb_fldlen_common + pb_fld_hdrlen32 + 3 * (pb_fld_hdrlen + pb_fldlen_int64) + + stats_len = pb_fldlen_common + pb_fld_hdrlen32 + 3 * (pb_fld_hdrlen + pb_fld_hdrlen32) + chunks[idx].min_value.str_val.length + chunks[idx].max_value.str_val.length; break; case dtype_none: stats_len = pb_fldlen_common; @@ -126,9 +139,6 @@ struct stats_state_s { statistics_chunk chunk; statistics_merge_group group; statistics_dtype stats_dtype; //!< Statistics data type for this column - // ORC stats - uint64_t numberOfValues; - uint8_t hasNull; }; /* @@ -178,6 +188,15 @@ __device__ inline uint8_t* pb_put_binary(uint8_t* p, uint32_t id, void const* by return p + len; } +__device__ inline uint8_t* pb_put_decimal( + uint8_t* p, uint32_t id, __int128_t value, int32_t scale, int32_t len) +{ + p[0] = id * 8 + ProtofType::FIXEDLEN; + p = pb_encode_uint(p + 1, len); + strings::detail::fixed_point_to_string(value, scale, reinterpret_cast(p)); + return p + len; +} + // Protobuf field encoding for 64-bit raw encoding (double) __device__ inline uint8_t* pb_put_fixed64(uint8_t* p, uint32_t id, void const* raw64) { @@ -186,6 +205,15 @@ __device__ inline uint8_t* pb_put_fixed64(uint8_t* p, uint32_t id, void const* r return p + 9; } +// Splits a nanosecond timestamp into milliseconds and nanoseconds +__device__ std::pair split_nanosecond_timestamp(int64_t nano_count) +{ 
+ auto const ns = cuda::std::chrono::nanoseconds(nano_count); + auto const ms_floor = cuda::std::chrono::floor(ns); + auto const ns_remainder = ns - ms_floor; + return {ms_floor.count(), ns_remainder.count()}; +} + /** * @brief Encode statistics in ORC protobuf format * @@ -228,12 +256,14 @@ __global__ void __launch_bounds__(encode_threads_per_block) // Encode and update actual bfr size if (idx < statistics_count && t == 0) { - s->chunk = chunks[idx]; - s->group = groups[idx]; - s->stats_dtype = s->group.stats_dtype; - s->base = blob_bfr + s->group.start_chunk; - s->end = blob_bfr + s->group.start_chunk + s->group.num_chunks; - uint8_t* cur = pb_put_uint(s->base, 1, s->chunk.non_nulls); + s->chunk = chunks[idx]; + s->group = groups[idx]; + s->stats_dtype = s->group.stats_dtype; + s->base = blob_bfr + s->group.start_chunk; + s->end = blob_bfr + s->group.start_chunk + s->group.num_chunks; + uint8_t* cur = pb_put_uint(s->base, 1, s->chunk.non_nulls); + cur = pb_put_uint(cur, 10, s->chunk.null_count != 0); // hasNull (bool) + uint8_t* fld_start = cur; switch (s->stats_dtype) { case dtype_int8: @@ -265,11 +295,14 @@ __global__ void __launch_bounds__(encode_threads_per_block) // optional double maximum = 2; // optional double sum = 3; // } - if (s->chunk.has_minmax) { + if (s->chunk.has_minmax || s->chunk.has_sum) { *cur = 3 * 8 + ProtofType::FIXEDLEN; cur += 2; - cur = pb_put_fixed64(cur, 1, &s->chunk.min_value.fp_val); - cur = pb_put_fixed64(cur, 2, &s->chunk.max_value.fp_val); + if (s->chunk.has_minmax) { + cur = pb_put_fixed64(cur, 1, &s->chunk.min_value.fp_val); + cur = pb_put_fixed64(cur, 2, &s->chunk.max_value.fp_val); + } + if (s->chunk.has_sum) { cur = pb_put_fixed64(cur, 3, &s->chunk.sum.fp_val); } fld_start[1] = cur - (fld_start + 2); } break; @@ -280,18 +313,25 @@ __global__ void __launch_bounds__(encode_threads_per_block) // optional string maximum = 2; // optional sint64 sum = 3; // sum will store the total length of all strings // } - if (s->chunk.has_minmax && s->chunk.has_sum) { - uint32_t sz = (pb_put_int(cur, 3, s->chunk.sum.i_val) - cur) + - (pb_put_uint(cur, 1, s->chunk.min_value.str_val.length) - cur) + - (pb_put_uint(cur, 2, s->chunk.max_value.str_val.length) - cur) + - s->chunk.min_value.str_val.length + s->chunk.max_value.str_val.length; + if (s->chunk.has_minmax || s->chunk.has_sum) { + uint32_t sz = 0; + if (s->chunk.has_minmax) { + sz += (pb_put_uint(cur, 1, s->chunk.min_value.str_val.length) - cur) + + (pb_put_uint(cur, 2, s->chunk.max_value.str_val.length) - cur) + + s->chunk.min_value.str_val.length + s->chunk.max_value.str_val.length; + } + if (s->chunk.has_sum) { sz += pb_put_int(cur, 3, s->chunk.sum.i_val) - cur; } + cur[0] = 4 * 8 + ProtofType::FIXEDLEN; cur = pb_encode_uint(cur + 1, sz); - cur = pb_put_binary( - cur, 1, s->chunk.min_value.str_val.ptr, s->chunk.min_value.str_val.length); - cur = pb_put_binary( - cur, 2, s->chunk.max_value.str_val.ptr, s->chunk.max_value.str_val.length); - cur = pb_put_int(cur, 3, s->chunk.sum.i_val); + + if (s->chunk.has_minmax) { + cur = pb_put_binary( + cur, 1, s->chunk.min_value.str_val.ptr, s->chunk.min_value.str_val.length); + cur = pb_put_binary( + cur, 2, s->chunk.max_value.str_val.ptr, s->chunk.max_value.str_val.length); + } + if (s->chunk.has_sum) { cur = pb_put_int(cur, 3, s->chunk.sum.i_val); } } break; case dtype_bool: @@ -299,8 +339,9 @@ __global__ void __launch_bounds__(encode_threads_per_block) // message BucketStatistics { // repeated uint64 count = 1 [packed=true]; // } - if (s->chunk.has_sum) { // Sum is equal 
to the number of 'true' values - cur[0] = 5 * 8 + ProtofType::FIXEDLEN; + if (s->chunk.has_sum) { + cur[0] = 5 * 8 + ProtofType::FIXEDLEN; + // count is equal to the number of 'true' values, despite what specs say cur = pb_put_packed_uint(cur + 2, 1, s->chunk.sum.u_val); fld_start[1] = cur - (fld_start + 2); } @@ -313,8 +354,33 @@ __global__ void __launch_bounds__(encode_threads_per_block) // optional string maximum = 2; // optional string sum = 3; // } - if (s->chunk.has_minmax) { - // TODO: Decimal support (decimal min/max stored as strings) + if (s->chunk.has_minmax or s->chunk.has_sum) { + auto const scale = s->group.col_dtype.scale(); + + uint32_t sz = 0; + auto const min_size = + s->chunk.has_minmax ? fixed_point_string_size(s->chunk.min_value.d128_val, scale) : 0; + auto const max_size = + s->chunk.has_minmax ? fixed_point_string_size(s->chunk.max_value.d128_val, scale) : 0; + if (s->chunk.has_minmax) { + // encoded string lengths, plus the strings + sz += (pb_put_uint(cur, 1, min_size) - cur) + min_size + + (pb_put_uint(cur, 1, max_size) - cur) + max_size; + } + auto const sum_size = + s->chunk.has_sum ? fixed_point_string_size(s->chunk.sum.d128_val, scale) : 0; + if (s->chunk.has_sum) { sz += (pb_put_uint(cur, 1, sum_size) - cur) + sum_size; } + + cur[0] = 6 * 8 + ProtofType::FIXEDLEN; + cur = pb_encode_uint(cur + 1, sz); + + if (s->chunk.has_minmax) { + cur = pb_put_decimal(cur, 1, s->chunk.min_value.d128_val, scale, min_size); // minimum + cur = pb_put_decimal(cur, 2, s->chunk.max_value.d128_val, scale, max_size); // maximum + } + if (s->chunk.has_sum) { + cur = pb_put_decimal(cur, 3, s->chunk.sum.d128_val, scale, sum_size); // sum + } } break; case dtype_date32: @@ -338,12 +404,24 @@ __global__ void __launch_bounds__(encode_threads_per_block) // optional sint64 maximum = 2; // optional sint64 minimumUtc = 3; // min,max values saved as milliseconds since UNIX epoch // optional sint64 maximumUtc = 4; + // optional int32 minimumNanos = 5; // lower 6 TS digits for min/max to achieve nanosecond + // precision optional int32 maximumNanos = 6; // } if (s->chunk.has_minmax) { cur[0] = 9 * 8 + ProtofType::FIXEDLEN; cur += 2; - cur = pb_put_int(cur, 3, s->chunk.min_value.i_val); // minimumUtc - cur = pb_put_int(cur, 4, s->chunk.max_value.i_val); // maximumUtc + auto const [min_ms, min_ns_remainder] = + split_nanosecond_timestamp(s->chunk.min_value.i_val); + auto const [max_ms, max_ns_remainder] = + split_nanosecond_timestamp(s->chunk.max_value.i_val); + + // minimum/maximum are the same as minimumUtc/maximumUtc as we always write files in UTC + cur = pb_put_int(cur, 1, min_ms); // minimum + cur = pb_put_int(cur, 2, max_ms); // maximum + cur = pb_put_int(cur, 3, min_ms); // minimumUtc + cur = pb_put_int(cur, 4, max_ms); // maximumUtc + cur = pb_put_int(cur, 5, min_ns_remainder); // minimumNanos + cur = pb_put_int(cur, 6, max_ns_remainder); // maximumNanos fld_start[1] = cur - (fld_start + 2); } break; @@ -403,7 +481,4 @@ void orc_encode_statistics(uint8_t* blob_bfr, blob_bfr, groups, chunks, statistics_count); } -} // namespace gpu -} // namespace orc -} // namespace io -} // namespace cudf +} // namespace cudf::io::orc::gpu diff --git a/cpp/src/io/parquet/page_enc.cu b/cpp/src/io/parquet/page_enc.cu index 0af561be8da..fe0dbb85124 100644 --- a/cpp/src/io/parquet/page_enc.cu +++ b/cpp/src/io/parquet/page_enc.cu @@ -1858,8 +1858,8 @@ __device__ std::pair get_extremum(statistics_val const* s } case dtype_int64: case dtype_timestamp64: - case dtype_float64: - case dtype_decimal64: return 
{stats_val, sizeof(int64_t)}; + case dtype_float64: return {stats_val, sizeof(int64_t)}; + case dtype_decimal64: case dtype_decimal128: byte_reverse128(stats_val->d128_val, scratch); return {scratch, sizeof(__int128_t)}; diff --git a/cpp/src/io/statistics/statistics_type_identification.cuh b/cpp/src/io/statistics/statistics_type_identification.cuh index 32931d7d34d..ea8c71f0dcb 100644 --- a/cpp/src/io/statistics/statistics_type_identification.cuh +++ b/cpp/src/io/statistics/statistics_type_identification.cuh @@ -49,15 +49,15 @@ enum class is_int96_timestamp { YES, NO }; template struct conversion_map; -// Every timestamp or duration type is converted to milliseconds in ORC statistics +// Every timestamp or duration type is converted to nanoseconds in ORC statistics template struct conversion_map { - using types = std::tuple, - std::pair, - std::pair, - std::pair, - std::pair, - std::pair>; + using types = std::tuple, + std::pair, + std::pair, + std::pair, + std::pair, + std::pair>; }; // In Parquet timestamps and durations with second resolution are converted to @@ -125,7 +125,7 @@ class extrema_type { using non_arithmetic_extrema_type = typename std::conditional_t< cudf::is_fixed_point() or cudf::is_duration() or cudf::is_timestamp(), - typename std::conditional_t, __int128_t, int64_t>, + typename std::conditional_t(), __int128_t, int64_t>, typename std::conditional_t< std::is_same_v, string_view, @@ -134,8 +134,7 @@ class extrema_type { // unsigned int/bool -> uint64_t // signed int -> int64_t // float/double -> double - // decimal32/64 -> int64_t - // decimal128 -> __int128_t + // decimal32/64/128 -> __int128_t // duration_[T] -> int64_t // string_view -> string_view // byte_array_view -> byte_array_view diff --git a/cpp/src/io/statistics/typed_statistics_chunk.cuh b/cpp/src/io/statistics/typed_statistics_chunk.cuh index d007209a12a..e6ec1471cb7 100644 --- a/cpp/src/io/statistics/typed_statistics_chunk.cuh +++ b/cpp/src/io/statistics/typed_statistics_chunk.cuh @@ -244,9 +244,9 @@ get_untyped_chunk(typed_statistics_chunk const& chunk) stat.null_count = chunk.null_count; stat.has_minmax = chunk.has_minmax; stat.has_sum = [&]() { - if (!chunk.has_minmax) return false; // invalidate the sum if overflow or underflow is possible if constexpr (std::is_floating_point_v or std::is_integral_v) { + if (!chunk.has_minmax) { return true; } return std::numeric_limits::max() / chunk.non_nulls >= static_cast(chunk.maximum_value) and std::numeric_limits::lowest() / chunk.non_nulls <= diff --git a/cpp/src/strings/convert/convert_fixed_point.cu b/cpp/src/strings/convert/convert_fixed_point.cu index a3336258d3e..51aab9faeba 100644 --- a/cpp/src/strings/convert/convert_fixed_point.cu +++ b/cpp/src/strings/convert/convert_fixed_point.cu @@ -21,7 +21,7 @@ #include #include #include -#include +#include #include #include #include @@ -200,62 +200,19 @@ struct from_fixed_point_fn { size_type* d_offsets{}; char* d_chars{}; - /** - * @brief Calculates the size of the string required to convert the element, in base-10 format. 
- * - * Output format is [-]integer.fraction - */ - __device__ int32_t compute_output_size(DecimalType value) - { - auto const scale = d_decimals.type().scale(); - - if (scale >= 0) return count_digits(value) + scale; - - auto const abs_value = numeric::detail::abs(value); - auto const exp_ten = numeric::detail::exp10(-scale); - auto const fraction = count_digits(abs_value % exp_ten); - auto const num_zeros = std::max(0, (-scale - fraction)); - return static_cast(value < 0) + // sign if negative - count_digits(abs_value / exp_ten) + // integer - 1 + // decimal point - num_zeros + // zeros padding - fraction; // size of fraction - } - /** * @brief Converts a decimal element into a string. * * The value is converted into base-10 digits [0-9] * plus the decimal point and a negative sign prefix. */ - __device__ void decimal_to_string(size_type idx) + __device__ void fixed_point_element_to_string(size_type idx) { auto const value = d_decimals.element(idx); auto const scale = d_decimals.type().scale(); char* d_buffer = d_chars + d_offsets[idx]; - if (scale >= 0) { - d_buffer += integer_to_string(value, d_buffer); - thrust::generate_n(thrust::seq, d_buffer, scale, []() { return '0'; }); // add zeros - return; - } - - // scale < 0 - // write format: [-]integer.fraction - // where integer = abs(value) / (10^abs(scale)) - // fraction = abs(value) % (10^abs(scale)) - if (value < 0) *d_buffer++ = '-'; // add sign - auto const abs_value = numeric::detail::abs(value); - auto const exp_ten = numeric::detail::exp10(-scale); - auto const num_zeros = std::max(0, (-scale - count_digits(abs_value % exp_ten))); - - d_buffer += integer_to_string(abs_value / exp_ten, d_buffer); // add the integer part - *d_buffer++ = '.'; // add decimal point - - thrust::generate_n(thrust::seq, d_buffer, num_zeros, []() { return '0'; }); // add zeros - d_buffer += num_zeros; - - integer_to_string(abs_value % exp_ten, d_buffer); // add the fraction part + fixed_point_to_string(value, scale, d_buffer); } __device__ void operator()(size_type idx) @@ -265,9 +222,10 @@ struct from_fixed_point_fn { return; } if (d_chars != nullptr) { - decimal_to_string(idx); + fixed_point_element_to_string(idx); } else { - d_offsets[idx] = compute_output_size(d_decimals.element(idx)); + d_offsets[idx] = + fixed_point_string_size(d_decimals.element(idx), d_decimals.type().scale()); } } }; diff --git a/cpp/tests/io/orc_test.cpp b/cpp/tests/io/orc_test.cpp index cff7b1cf081..890ef914713 100644 --- a/cpp/tests/io/orc_test.cpp +++ b/cpp/tests/io/orc_test.cpp @@ -976,6 +976,10 @@ TEST_F(OrcReaderTest, CombinedSkipRowTest) TEST_F(OrcStatisticsTest, Basic) { auto sequence = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i; }); + auto ts_sequence = + cudf::detail::make_counting_transform_iterator(0, [](auto i) { return (i - 4) * 1000002; }); + auto dec_sequence = + cudf::detail::make_counting_transform_iterator(0, [&](auto i) { return i * 1001; }); auto validity = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i % 2; }); std::vector strings{ @@ -986,11 +990,17 @@ TEST_F(OrcStatisticsTest, Basic) sequence, sequence + num_rows, validity); column_wrapper col2( sequence, sequence + num_rows, validity); - column_wrapper col3{strings.begin(), strings.end()}; - column_wrapper col4(sequence, sequence + num_rows); - column_wrapper col5( - sequence, sequence + num_rows, validity); - table_view expected({col1, col2, col3, col4, col5}); + str_col col3{strings.begin(), strings.end()}; + column_wrapper col4( + ts_sequence, 
ts_sequence + num_rows, validity); + column_wrapper col5( + ts_sequence, ts_sequence + num_rows, validity); + bool_col col6({true, true, true, true, true, false, false, false, false}, validity); + + cudf::test::fixed_point_column_wrapper col7( + dec_sequence, dec_sequence + num_rows, numeric::scale_type{-1}); + + table_view expected({col1, col2, col3, col4, col5, col6, col7}); auto filepath = temp_env->get_temp_filepath("OrcStatsMerge.orc"); @@ -1000,16 +1010,21 @@ TEST_F(OrcStatisticsTest, Basic) auto const stats = cudf::io::read_parsed_orc_statistics(cudf::io::source_info{filepath}); - auto const expected_column_names = - std::vector{"", "_col0", "_col1", "_col2", "_col3", "_col4"}; + auto expected_column_names = std::vector{""}; + std::generate_n( + std::back_inserter(expected_column_names), + expected.num_columns(), + [starting_index = 0]() mutable { return "_col" + std::to_string(starting_index++); }); EXPECT_EQ(stats.column_names, expected_column_names); auto validate_statistics = [&](std::vector const& stats) { + ASSERT_EQ(stats.size(), expected.num_columns() + 1); auto& s0 = stats[0]; EXPECT_EQ(*s0.number_of_values, 9ul); auto& s1 = stats[1]; EXPECT_EQ(*s1.number_of_values, 4ul); + EXPECT_TRUE(*s1.has_null); auto& ts1 = std::get(s1.type_specific_stats); EXPECT_EQ(*ts1.minimum, 1); EXPECT_EQ(*ts1.maximum, 7); @@ -1017,30 +1032,55 @@ TEST_F(OrcStatisticsTest, Basic) auto& s2 = stats[2]; EXPECT_EQ(*s2.number_of_values, 4ul); + EXPECT_TRUE(*s2.has_null); auto& ts2 = std::get(s2.type_specific_stats); EXPECT_EQ(*ts2.minimum, 1.); EXPECT_EQ(*ts2.maximum, 7.); - // No sum ATM, filed #7087 - ASSERT_FALSE(ts2.sum); + EXPECT_EQ(*ts2.sum, 16.); auto& s3 = stats[3]; EXPECT_EQ(*s3.number_of_values, 9ul); + EXPECT_FALSE(*s3.has_null); auto& ts3 = std::get(s3.type_specific_stats); EXPECT_EQ(*ts3.minimum, "Friday"); EXPECT_EQ(*ts3.maximum, "Wednesday"); EXPECT_EQ(*ts3.sum, 58ul); auto& s4 = stats[4]; - EXPECT_EQ(*s4.number_of_values, 9ul); - EXPECT_EQ(std::get(s4.type_specific_stats).count[0], 8ul); + EXPECT_EQ(*s4.number_of_values, 4ul); + EXPECT_TRUE(*s4.has_null); + auto& ts4 = std::get(s4.type_specific_stats); + EXPECT_EQ(*ts4.minimum, -4); + EXPECT_EQ(*ts4.maximum, 3); + EXPECT_EQ(*ts4.minimum_utc, -4); + EXPECT_EQ(*ts4.maximum_utc, 3); + EXPECT_EQ(*ts4.minimum_nanos, 999994); + EXPECT_EQ(*ts4.maximum_nanos, 6); auto& s5 = stats[5]; EXPECT_EQ(*s5.number_of_values, 4ul); + EXPECT_TRUE(*s5.has_null); auto& ts5 = std::get(s5.type_specific_stats); - EXPECT_EQ(*ts5.minimum_utc, 1000); - EXPECT_EQ(*ts5.maximum_utc, 7000); - ASSERT_FALSE(ts5.minimum); - ASSERT_FALSE(ts5.maximum); + EXPECT_EQ(*ts5.minimum, -3001); + EXPECT_EQ(*ts5.maximum, 3000); + EXPECT_EQ(*ts5.minimum_utc, -3001); + EXPECT_EQ(*ts5.maximum_utc, 3000); + EXPECT_EQ(*ts5.minimum_nanos, 994000); + EXPECT_EQ(*ts5.maximum_nanos, 6000); + + auto& s6 = stats[6]; + EXPECT_EQ(*s6.number_of_values, 4ul); + EXPECT_TRUE(*s6.has_null); + auto& ts6 = std::get(s6.type_specific_stats); + EXPECT_EQ(ts6.count[0], 2); + + auto& s7 = stats[7]; + EXPECT_EQ(*s7.number_of_values, 9ul); + EXPECT_FALSE(*s7.has_null); + auto& ts7 = std::get(s7.type_specific_stats); + EXPECT_EQ(*ts7.minimum, "0.0"); + EXPECT_EQ(*ts7.maximum, "800.8"); + EXPECT_EQ(*ts7.sum, "3603.6"); }; validate_statistics(stats.file_stats); @@ -1259,9 +1299,8 @@ TEST_F(OrcStatisticsTest, Overflow) TEST_F(OrcStatisticsTest, HasNull) { - // cudf's ORC writer doesn't yet support the ability to encode the hasNull value in statistics so - // we're embedding a file created using pyorc - // + // 
This test can now be implemented with libcudf; keeping the pyorc version to keep the test + // inputs diversified // Method to create file: // >>> import pyorc // >>> output = open("./temp.orc", "wb") @@ -1861,4 +1900,38 @@ TEST_F(OrcWriterTest, EmptyChildStringColumn) CUDF_TEST_EXPECT_TABLES_EQUAL(expected, result.tbl->view()); } +template +void check_all_null_stats(cudf::io::column_statistics const& stats) +{ + EXPECT_EQ(stats.number_of_values, 0); + EXPECT_TRUE(stats.has_null); + + auto const ts = std::get(stats.type_specific_stats); + EXPECT_FALSE(ts.minimum.has_value()); + EXPECT_FALSE(ts.maximum.has_value()); + EXPECT_TRUE(ts.sum.has_value()); + EXPECT_EQ(*ts.sum, 0); +} + +TEST_F(OrcStatisticsTest, AllNulls) +{ + float64_col double_col({0., 0., 0.}, cudf::test::iterators::all_nulls()); + int32_col int_col({0, 0, 0}, cudf::test::iterators::all_nulls()); + str_col string_col({"", "", ""}, cudf::test::iterators::all_nulls()); + + cudf::table_view expected({int_col, double_col, string_col}); + + std::vector out_buffer; + cudf::io::orc_writer_options out_opts = + cudf::io::orc_writer_options::builder(cudf::io::sink_info{&out_buffer}, expected); + cudf::io::write_orc(out_opts); + + auto const stats = cudf::io::read_parsed_orc_statistics( + cudf::io::source_info{out_buffer.data(), out_buffer.size()}); + + check_all_null_stats(stats.file_stats[1]); + check_all_null_stats(stats.file_stats[2]); + check_all_null_stats(stats.file_stats[3]); +} + CUDF_TEST_PROGRAM_MAIN() diff --git a/python/cudf/cudf/tests/test_orc.py b/python/cudf/cudf/tests/test_orc.py index aafc8831bf4..07aa5430f4f 100644 --- a/python/cudf/cudf/tests/test_orc.py +++ b/python/cudf/cudf/tests/test_orc.py @@ -633,16 +633,19 @@ def test_orc_write_statistics(tmpdir, datadir, nrows, stats_freq): for col in gdf: if "minimum" in file_stats[0][col]: stats_min = file_stats[0][col]["minimum"] - actual_min = gdf[col].min() - assert normalized_equals(actual_min, stats_min) + if stats_min is not None: + actual_min = gdf[col].min() + assert normalized_equals(actual_min, stats_min) if "maximum" in file_stats[0][col]: stats_max = file_stats[0][col]["maximum"] - actual_max = gdf[col].max() - assert normalized_equals(actual_max, stats_max) + if stats_max is not None: + actual_max = gdf[col].max() + assert normalized_equals(actual_max, stats_max) if "number_of_values" in file_stats[0][col]: stats_num_vals = file_stats[0][col]["number_of_values"] - actual_num_vals = gdf[col].count() - assert stats_num_vals == actual_num_vals + if stats_num_vals is not None: + actual_num_vals = gdf[col].count() + assert stats_num_vals == actual_num_vals # compare stripe statistics with actual min/max for stripe_idx in range(0, orc_file.nstripes): @@ -651,21 +654,24 @@ def test_orc_write_statistics(tmpdir, datadir, nrows, stats_freq): stripe_df = cudf.DataFrame(stripe.to_pandas()) for col in stripe_df: if "minimum" in stripes_stats[stripe_idx][col]: - actual_min = stripe_df[col].min() stats_min = stripes_stats[stripe_idx][col]["minimum"] - assert normalized_equals(actual_min, stats_min) + if stats_min is not None: + actual_min = stripe_df[col].min() + assert normalized_equals(actual_min, stats_min) if "maximum" in stripes_stats[stripe_idx][col]: - actual_max = stripe_df[col].max() stats_max = stripes_stats[stripe_idx][col]["maximum"] - assert normalized_equals(actual_max, stats_max) + if stats_max is not None: + actual_max = stripe_df[col].max() + assert normalized_equals(actual_max, stats_max) if "number_of_values" in stripes_stats[stripe_idx][col]: stats_num_vals 
= stripes_stats[stripe_idx][col][ "number_of_values" ] - actual_num_vals = stripe_df[col].count() - assert stats_num_vals == actual_num_vals + if stats_num_vals is not None: + actual_num_vals = stripe_df[col].count() + assert stats_num_vals == actual_num_vals @pytest.mark.parametrize("stats_freq", ["STRIPE", "ROWGROUP"]) @@ -733,16 +739,19 @@ def test_orc_chunked_write_statistics(tmpdir, datadir, nrows, stats_freq): for col in expect: if "minimum" in file_stats[0][col]: stats_min = file_stats[0][col]["minimum"] - actual_min = expect[col].min() - assert normalized_equals(actual_min, stats_min) + if stats_min is not None: + actual_min = expect[col].min() + assert normalized_equals(actual_min, stats_min) if "maximum" in file_stats[0][col]: stats_max = file_stats[0][col]["maximum"] - actual_max = expect[col].max() - assert normalized_equals(actual_max, stats_max) + if stats_max is not None: + actual_max = expect[col].max() + assert normalized_equals(actual_max, stats_max) if "number_of_values" in file_stats[0][col]: stats_num_vals = file_stats[0][col]["number_of_values"] - actual_num_vals = expect[col].count() - assert stats_num_vals == actual_num_vals + if stats_num_vals is not None: + actual_num_vals = expect[col].count() + assert stats_num_vals == actual_num_vals # compare stripe statistics with actual min/max for stripe_idx in range(0, orc_file.nstripes): @@ -751,21 +760,24 @@ def test_orc_chunked_write_statistics(tmpdir, datadir, nrows, stats_freq): stripe_df = cudf.DataFrame(stripe.to_pandas()) for col in stripe_df: if "minimum" in stripes_stats[stripe_idx][col]: - actual_min = stripe_df[col].min() stats_min = stripes_stats[stripe_idx][col]["minimum"] - assert normalized_equals(actual_min, stats_min) + if stats_min is not None: + actual_min = stripe_df[col].min() + assert normalized_equals(actual_min, stats_min) if "maximum" in stripes_stats[stripe_idx][col]: - actual_max = stripe_df[col].max() stats_max = stripes_stats[stripe_idx][col]["maximum"] - assert normalized_equals(actual_max, stats_max) + if stats_max is not None: + actual_max = stripe_df[col].max() + assert normalized_equals(actual_max, stats_max) if "number_of_values" in stripes_stats[stripe_idx][col]: stats_num_vals = stripes_stats[stripe_idx][col][ "number_of_values" ] - actual_num_vals = stripe_df[col].count() - assert stats_num_vals == actual_num_vals + if stats_num_vals is not None: + actual_num_vals = stripe_df[col].count() + assert stats_num_vals == actual_num_vals @pytest.mark.parametrize("nrows", [1, 100, 6000000]) From bdc1f3a6e1f383cd689ba8e92903b89e49cdb8d8 Mon Sep 17 00:00:00 2001 From: David Wendt <45795991+davidwendt@users.noreply.github.com> Date: Mon, 18 Sep 2023 19:34:29 -0400 Subject: [PATCH 183/230] Expose streams in public strings case APIs (#14056) Add stream parameter to public strings APIs: - `cudf::strings::capitalize()` - `cudf::strings::title()` - `cudf::strings::is_title()` - `cudf::strings::to_lower()` - `cudf::strings::to_upper()` - `cudf::strings::swapcase()` Reference #13744 Authors: - David Wendt (https://github.com/davidwendt) Approvers: - Mark Harris (https://github.com/harrism) - Vyas Ramasubramani (https://github.com/vyasr) URL: https://github.com/rapidsai/cudf/pull/14056 --- cpp/include/cudf/strings/capitalize.hpp | 28 ++++++++----- cpp/include/cudf/strings/case.hpp | 8 +++- cpp/src/strings/capitalize.cu | 9 ++-- cpp/src/strings/case.cu | 9 ++-- cpp/tests/CMakeLists.txt | 1 + cpp/tests/streams/strings/case_test.cpp | 55 +++++++++++++++++++++++++ 6 files changed, 92 insertions(+), 18 
deletions(-) create mode 100644 cpp/tests/streams/strings/case_test.cpp diff --git a/cpp/include/cudf/strings/capitalize.hpp b/cpp/include/cudf/strings/capitalize.hpp index 6d01ab047ba..57375e9ac6a 100644 --- a/cpp/include/cudf/strings/capitalize.hpp +++ b/cpp/include/cudf/strings/capitalize.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2022, NVIDIA CORPORATION. + * Copyright (c) 2020-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -50,16 +50,18 @@ namespace strings { * * Any null string entries return corresponding null output column entries. * - * @throw cudf::logic_error if `delimiter.is_valid()` is `false`. + * @throw cudf::logic_error if `delimiter.is_valid()` is `false`. * - * @param input String column. - * @param delimiters Characters for identifying words to capitalize. + * @param input String column + * @param delimiters Characters for identifying words to capitalize + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned column's device memory - * @return Column of strings capitalized from the input column. + * @return Column of strings capitalized from the input column */ std::unique_ptr capitalize( strings_column_view const& input, - string_scalar const& delimiters = string_scalar(""), + string_scalar const& delimiters = string_scalar("", true, cudf::get_default_stream()), + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -83,14 +85,16 @@ std::unique_ptr capitalize( * * Any null string entries return corresponding null output column entries. * - * @param input String column. - * @param sequence_type The character type that is used when identifying words. + * @param input String column + * @param sequence_type The character type that is used when identifying words + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned column's device memory - * @return Column of titled strings. + * @return Column of titled strings */ std::unique_ptr title( strings_column_view const& input, string_character_types sequence_type = string_character_types::ALPHA, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -112,12 +116,14 @@ std::unique_ptr title( * * Any null string entries result in corresponding null output column entries. * - * @param input String column. + * @param input String column + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned column's device memory - * @return Column of type BOOL8. + * @return Column of type BOOL8 */ std::unique_ptr is_title( strings_column_view const& input, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @} */ // end of doxygen group diff --git a/cpp/include/cudf/strings/case.hpp b/cpp/include/cudf/strings/case.hpp index 06ba4f8d882..94191686a92 100644 --- a/cpp/include/cudf/strings/case.hpp +++ b/cpp/include/cudf/strings/case.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2022, NVIDIA CORPORATION. + * Copyright (c) 2019-2023, NVIDIA CORPORATION. 
* * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -38,11 +38,13 @@ namespace strings { * Any null entries create null entries in the output column. * * @param strings Strings instance for this operation. + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned column's device memory. * @return New column of strings with characters converted. */ std::unique_ptr to_lower( strings_column_view const& strings, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -55,11 +57,13 @@ std::unique_ptr to_lower( * Any null entries create null entries in the output column. * * @param strings Strings instance for this operation. + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned column's device memory. * @return New column of strings with characters converted. */ std::unique_ptr to_upper( strings_column_view const& strings, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -73,11 +77,13 @@ std::unique_ptr to_upper( * Any null entries create null entries in the output column. * * @param strings Strings instance for this operation. + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned column's device memory. * @return New column of strings with characters converted. */ std::unique_ptr swapcase( strings_column_view const& strings, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @} */ // end of doxygen group diff --git a/cpp/src/strings/capitalize.cu b/cpp/src/strings/capitalize.cu index 4e248922702..c555031b588 100644 --- a/cpp/src/strings/capitalize.cu +++ b/cpp/src/strings/capitalize.cu @@ -287,25 +287,28 @@ std::unique_ptr is_title(strings_column_view const& input, std::unique_ptr capitalize(strings_column_view const& input, string_scalar const& delimiter, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::capitalize(input, delimiter, cudf::get_default_stream(), mr); + return detail::capitalize(input, delimiter, stream, mr); } std::unique_ptr title(strings_column_view const& input, string_character_types sequence_type, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::title(input, sequence_type, cudf::get_default_stream(), mr); + return detail::title(input, sequence_type, stream, mr); } std::unique_ptr is_title(strings_column_view const& input, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::is_title(input, cudf::get_default_stream(), mr); + return detail::is_title(input, stream, mr); } } // namespace strings diff --git a/cpp/src/strings/case.cu b/cpp/src/strings/case.cu index c5fe7a19f53..8f4c2ee574a 100644 --- a/cpp/src/strings/case.cu +++ b/cpp/src/strings/case.cu @@ -310,24 +310,27 @@ std::unique_ptr swapcase(strings_column_view const& strings, // APIs std::unique_ptr to_lower(strings_column_view const& strings, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { 
CUDF_FUNC_RANGE(); - return detail::to_lower(strings, cudf::get_default_stream(), mr); + return detail::to_lower(strings, stream, mr); } std::unique_ptr to_upper(strings_column_view const& strings, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::to_upper(strings, cudf::get_default_stream(), mr); + return detail::to_upper(strings, stream, mr); } std::unique_ptr swapcase(strings_column_view const& strings, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::swapcase(strings, cudf::get_default_stream(), mr); + return detail::swapcase(strings, stream, mr); } } // namespace strings diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt index a69dc9bf2f8..4923ef5c903 100644 --- a/cpp/tests/CMakeLists.txt +++ b/cpp/tests/CMakeLists.txt @@ -627,6 +627,7 @@ ConfigureTest(STREAM_CONCATENATE_TEST streams/concatenate_test.cpp STREAM_MODE t ConfigureTest(STREAM_FILLING_TEST streams/filling_test.cpp STREAM_MODE testing) ConfigureTest(STREAM_REPLACE_TEST streams/replace_test.cpp STREAM_MODE testing) ConfigureTest(STREAM_SEARCH_TEST streams/search_test.cpp STREAM_MODE testing) +ConfigureTest(STREAM_STRINGS_TEST streams/strings/case_test.cpp STREAM_MODE testing) # ################################################################################################## # Install tests #################################################################################### diff --git a/cpp/tests/streams/strings/case_test.cpp b/cpp/tests/streams/strings/case_test.cpp new file mode 100644 index 00000000000..df3eabd773a --- /dev/null +++ b/cpp/tests/streams/strings/case_test.cpp @@ -0,0 +1,55 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include + +#include +#include +#include + +class StringsCaseTest : public cudf::test::BaseFixture {}; + +TEST_F(StringsCaseTest, LowerUpper) +{ + auto const input = + cudf::test::strings_column_wrapper({"", + "The quick brown fox", + "jumps over the lazy dog.", + "all work and no play makes Jack a dull boy", + R"(!"#$%&'()*+,-./0123456789:;<=>?@[\]^_`{|}~)"}); + auto view = cudf::strings_column_view(input); + + cudf::strings::to_lower(view, cudf::test::get_default_stream()); + cudf::strings::to_upper(view, cudf::test::get_default_stream()); + cudf::strings::swapcase(view, cudf::test::get_default_stream()); +} + +TEST_F(StringsCaseTest, Capitalize) +{ + auto const input = + cudf::test::strings_column_wrapper({"", + "The Quick Brown Fox", + "jumps over the lazy dog", + "all work and no play makes Jack a dull boy"}); + auto view = cudf::strings_column_view(input); + + auto const delimiter = cudf::string_scalar(" ", true, cudf::test::get_default_stream()); + cudf::strings::capitalize(view, delimiter, cudf::test::get_default_stream()); + cudf::strings::is_title(view, cudf::test::get_default_stream()); + cudf::strings::title( + view, cudf::strings::string_character_types::ALPHA, cudf::test::get_default_stream()); +} From c016b58b24e63468e9110a6ca82adfc5fd61202d Mon Sep 17 00:00:00 2001 From: Bradley Dice Date: Tue, 19 Sep 2023 07:50:20 -0500 Subject: [PATCH 184/230] Update to clang 16.0.6. (#14120) This PR updates cudf to use clang 16.0.6. The previous version 16.0.1 has some minor formatting issues affecting several RAPIDS repos. Authors: - Bradley Dice (https://github.com/bdice) Approvers: - Mark Harris (https://github.com/harrism) - David Wendt (https://github.com/davidwendt) URL: https://github.com/rapidsai/cudf/pull/14120 --- .pre-commit-config.yaml | 2 +- cpp/benchmarks/iterator/iterator.cu | 2 +- .../stream_compaction/apply_boolean_mask.cpp | 4 +- cpp/benchmarks/string/char_types.cpp | 2 +- cpp/benchmarks/string/extract.cpp | 2 +- .../cudf/column/column_device_view.cuh | 2 +- cpp/include/cudf/detail/copy_if.cuh | 2 +- cpp/include/cudf/detail/indexalator.cuh | 4 +- cpp/include/cudf/detail/join.hpp | 4 +- cpp/include/cudf/fixed_point/fixed_point.hpp | 2 +- cpp/include/cudf/groupby.hpp | 4 +- cpp/include/cudf/io/csv.hpp | 2 +- cpp/include/cudf/io/json.hpp | 2 +- cpp/include/cudf/strings/detail/utf8.hpp | 36 ++-- cpp/include/cudf/table/row_operators.cuh | 4 +- cpp/include/cudf/table/table_view.hpp | 2 +- cpp/include/cudf/wrappers/dictionary.hpp | 2 +- cpp/include/cudf_test/base_fixture.hpp | 4 +- cpp/include/nvtext/subword_tokenize.hpp | 2 +- cpp/scripts/run-clang-tidy.py | 2 +- cpp/src/copying/contiguous_split.cu | 8 +- cpp/src/groupby/sort/functors.hpp | 10 +- cpp/src/io/avro/avro_gpu.cu | 2 +- cpp/src/io/comp/cpu_unbz2.cpp | 2 +- cpp/src/io/comp/debrotli.cu | 4 +- cpp/src/io/comp/gpuinflate.cu | 18 +- cpp/src/io/comp/uncomp.cpp | 10 +- cpp/src/io/comp/unsnap.cu | 2 +- cpp/src/io/json/json_column.cu | 2 +- cpp/src/io/json/nested_json_gpu.cu | 160 +++++++++--------- cpp/src/io/orc/orc_gpu.hpp | 2 +- cpp/src/io/orc/stripe_data.cu | 4 +- .../io/parquet/compact_protocol_reader.cpp | 2 +- .../io/parquet/compact_protocol_writer.cpp | 2 +- cpp/src/io/parquet/delta_binary.cuh | 20 +-- cpp/src/io/parquet/page_delta_decode.cu | 2 +- cpp/src/io/parquet/parquet.hpp | 4 +- cpp/src/io/parquet/parquet_gpu.hpp | 22 +-- cpp/src/io/parquet/reader_impl_preprocess.cu | 2 +- cpp/src/join/join.cu | 4 +- .../quantiles/tdigest/tdigest_aggregation.cu | 2 +- 
.../rolling/detail/rolling_collect_list.cuh | 2 +- cpp/src/strings/char_types/char_types.cu | 4 +- cpp/src/strings/convert/convert_datetime.cu | 6 +- cpp/src/strings/convert/convert_durations.cu | 2 +- cpp/src/strings/convert/convert_floats.cu | 6 +- cpp/src/strings/convert/convert_integers.cu | 2 +- cpp/src/strings/convert/convert_ipv4.cu | 2 +- cpp/src/strings/convert/convert_urls.cu | 4 +- cpp/src/strings/json/json_path.cu | 2 +- cpp/src/strings/regex/regcomp.cpp | 14 +- cpp/src/strings/regex/regcomp.h | 8 +- cpp/src/strings/regex/regex.cuh | 18 +- cpp/src/strings/regex/regex.inl | 10 +- cpp/src/strings/replace/replace_re.cu | 2 +- cpp/src/strings/split/partition.cu | 2 +- cpp/src/strings/split/split.cuh | 2 +- cpp/src/strings/split/split_re.cu | 2 +- cpp/src/strings/utilities.cu | 6 +- cpp/src/text/normalize.cu | 4 +- cpp/src/text/replace.cu | 2 +- cpp/src/text/subword/bpe_tokenizer.cu | 2 +- cpp/src/text/subword/load_merges_file.cu | 2 +- cpp/src/text/utilities/tokenize_ops.cuh | 2 +- cpp/tests/groupby/merge_lists_tests.cpp | 2 +- cpp/tests/groupby/merge_sets_tests.cpp | 12 +- cpp/tests/io/parquet_test.cpp | 6 +- cpp/tests/lists/reverse_tests.cpp | 8 +- .../difference_distinct_tests.cpp | 2 +- .../intersect_distinct_tests.cpp | 4 +- .../set_operations/union_distinct_tests.cpp | 4 +- .../stream_compaction/distinct_tests.cpp | 10 +- .../reshape/interleave_columns_tests.cpp | 2 +- .../rolling/range_rolling_window_test.cpp | 2 +- cpp/tests/sort/segmented_sort_tests.cpp | 2 +- cpp/tests/strings/chars_types_tests.cpp | 12 +- cpp/tests/strings/durations_tests.cpp | 8 +- cpp/tests/utilities/column_utilities.cu | 2 +- 78 files changed, 276 insertions(+), 276 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 238e5b44030..7e44091774f 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -63,7 +63,7 @@ repos: # Explicitly specify the pyproject.toml at the repo root, not per-project. 
args: ["--config=pyproject.toml"] - repo: https://github.com/pre-commit/mirrors-clang-format - rev: v16.0.1 + rev: v16.0.6 hooks: - id: clang-format types_or: [c, c++, cuda] diff --git a/cpp/benchmarks/iterator/iterator.cu b/cpp/benchmarks/iterator/iterator.cu index 7acf24c30a5..dcd13cf62c4 100644 --- a/cpp/benchmarks/iterator/iterator.cu +++ b/cpp/benchmarks/iterator/iterator.cu @@ -145,7 +145,7 @@ void BM_iterator(benchmark::State& state) cuda_event_timer raii(state, true); // flush_l2_cache = true, stream = 0 if (cub_or_thrust) { if (raw_or_iterator) { - raw_stream_bench_cub(hasnull_F, dev_result); // driven by raw pointer + raw_stream_bench_cub(hasnull_F, dev_result); // driven by raw pointer } else { iterator_bench_cub(hasnull_F, dev_result); // driven by riterator without nulls } diff --git a/cpp/benchmarks/stream_compaction/apply_boolean_mask.cpp b/cpp/benchmarks/stream_compaction/apply_boolean_mask.cpp index a6feaf04842..f78aa9fa654 100644 --- a/cpp/benchmarks/stream_compaction/apply_boolean_mask.cpp +++ b/cpp/benchmarks/stream_compaction/apply_boolean_mask.cpp @@ -59,8 +59,8 @@ void calculate_bandwidth(benchmark::State& state, cudf::size_type num_columns) int64_t const column_bytes_in = column_bytes_out; // we only read unmasked inputs int64_t const bytes_read = - (column_bytes_in + validity_bytes_in) * num_columns + // reading columns - mask_size; // reading boolean mask + (column_bytes_in + validity_bytes_in) * num_columns + // reading columns + mask_size; // reading boolean mask int64_t const bytes_written = (column_bytes_out + validity_bytes_out) * num_columns; // writing columns diff --git a/cpp/benchmarks/string/char_types.cpp b/cpp/benchmarks/string/char_types.cpp index 8e9e595fcef..59e6245fd41 100644 --- a/cpp/benchmarks/string/char_types.cpp +++ b/cpp/benchmarks/string/char_types.cpp @@ -43,7 +43,7 @@ static void bench_char_types(nvbench::state& state) state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::get_default_stream().value())); // gather some throughput statistics as well auto chars_size = input.chars_size(); - state.add_global_memory_reads(chars_size); // all bytes are read; + state.add_global_memory_reads(chars_size); // all bytes are read; if (api_type == "all") { state.add_global_memory_writes(num_rows); // output is a bool8 per row } else { diff --git a/cpp/benchmarks/string/extract.cpp b/cpp/benchmarks/string/extract.cpp index 9e67c5a5b52..135dadabbe4 100644 --- a/cpp/benchmarks/string/extract.cpp +++ b/cpp/benchmarks/string/extract.cpp @@ -43,7 +43,7 @@ static void bench_extract(nvbench::state& state) std::uniform_int_distribution words_dist(0, 999); std::vector samples(100); // 100 unique rows of data to reuse std::generate(samples.begin(), samples.end(), [&]() { - std::string row; // build a row of random tokens + std::string row; // build a row of random tokens while (static_cast(row.size()) < row_width) { row += std::to_string(words_dist(generator)) + " "; } diff --git a/cpp/include/cudf/column/column_device_view.cuh b/cpp/include/cudf/column/column_device_view.cuh index 05ef21bd750..35851a99822 100644 --- a/cpp/include/cudf/column/column_device_view.cuh +++ b/cpp/include/cudf/column/column_device_view.cuh @@ -1393,7 +1393,7 @@ struct pair_accessor { */ template struct pair_rep_accessor { - column_device_view const col; ///< column view of column in device + column_device_view const col; ///< column view of column in device using rep_type = device_storage_type_t; ///< representation type diff --git a/cpp/include/cudf/detail/copy_if.cuh 
b/cpp/include/cudf/detail/copy_if.cuh index 1dd91dcd865..ebe7e052b6d 100644 --- a/cpp/include/cudf/detail/copy_if.cuh +++ b/cpp/include/cudf/detail/copy_if.cuh @@ -133,7 +133,7 @@ __launch_bounds__(block_size) __global__ if (has_validity) { temp_valids[threadIdx.x] = false; // init shared memory if (threadIdx.x < cudf::detail::warp_size) temp_valids[block_size + threadIdx.x] = false; - __syncthreads(); // wait for init + __syncthreads(); // wait for init } if (mask_true) { diff --git a/cpp/include/cudf/detail/indexalator.cuh b/cpp/include/cudf/detail/indexalator.cuh index 0ab9da0dbd0..4731c4919e3 100644 --- a/cpp/include/cudf/detail/indexalator.cuh +++ b/cpp/include/cudf/detail/indexalator.cuh @@ -248,7 +248,7 @@ struct input_indexalator : base_indexalator { friend struct indexalator_factory; friend struct base_indexalator; // for CRTP - using reference = size_type const; // this keeps STL and thrust happy + using reference = size_type const; // this keeps STL and thrust happy input_indexalator() = default; input_indexalator(input_indexalator const&) = default; @@ -332,7 +332,7 @@ struct output_indexalator : base_indexalator { friend struct indexalator_factory; friend struct base_indexalator; // for CRTP - using reference = output_indexalator const&; // required for output iterators + using reference = output_indexalator const&; // required for output iterators output_indexalator() = default; output_indexalator(output_indexalator const&) = default; diff --git a/cpp/include/cudf/detail/join.hpp b/cpp/include/cudf/detail/join.hpp index 6fcf10aef57..b69632c83ca 100644 --- a/cpp/include/cudf/detail/join.hpp +++ b/cpp/include/cudf/detail/join.hpp @@ -78,8 +78,8 @@ struct hash_join { cudf::null_equality const _nulls_equal; ///< whether to consider nulls as equal cudf::table_view _build; ///< input table to build the hash map std::shared_ptr - _preprocessed_build; ///< input table preprocssed for row operators - map_type _hash_table; ///< hash table built on `_build` + _preprocessed_build; ///< input table preprocssed for row operators + map_type _hash_table; ///< hash table built on `_build` public: /** diff --git a/cpp/include/cudf/fixed_point/fixed_point.hpp b/cpp/include/cudf/fixed_point/fixed_point.hpp index 7c59c2f9194..13d8716c1df 100644 --- a/cpp/include/cudf/fixed_point/fixed_point.hpp +++ b/cpp/include/cudf/fixed_point/fixed_point.hpp @@ -829,5 +829,5 @@ using decimal32 = fixed_point; ///< 32-bit decima using decimal64 = fixed_point; ///< 64-bit decimal fixed point using decimal128 = fixed_point<__int128_t, Radix::BASE_10>; ///< 128-bit decimal fixed point -/** @} */ // end of group +/** @} */ // end of group } // namespace numeric diff --git a/cpp/include/cudf/groupby.hpp b/cpp/include/cudf/groupby.hpp index 6e575685daa..1c31e8777a8 100644 --- a/cpp/include/cudf/groupby.hpp +++ b/cpp/include/cudf/groupby.hpp @@ -386,8 +386,8 @@ class groupby { ///< indicates null order ///< of each column std::unique_ptr - _helper; ///< Helper object - ///< used by sort based implementation + _helper; ///< Helper object + ///< used by sort based implementation /** * @brief Get the sort helper object diff --git a/cpp/include/cudf/io/csv.hpp b/cpp/include/cudf/io/csv.hpp index c84ca7e6c73..b49a13a8ea9 100644 --- a/cpp/include/cudf/io/csv.hpp +++ b/cpp/include/cudf/io/csv.hpp @@ -213,7 +213,7 @@ class csv_reader_options { auto const max_row_bytes = 16 * 1024; // 16KB auto const column_bytes = 64; - auto const base_padding = 1024; // 1KB + auto const base_padding = 1024; // 1KB if (num_columns == 0) { // 
Use flat size if the number of columns is not known diff --git a/cpp/include/cudf/io/json.hpp b/cpp/include/cudf/io/json.hpp index 15dc2a614ad..d408d249a7f 100644 --- a/cpp/include/cudf/io/json.hpp +++ b/cpp/include/cudf/io/json.hpp @@ -207,7 +207,7 @@ class json_reader_options { auto const max_row_bytes = 16 * 1024; // 16KB auto const column_bytes = 64; - auto const base_padding = 1024; // 1KB + auto const base_padding = 1024; // 1KB if (num_columns == 0) { // Use flat size if the number of columns is not known diff --git a/cpp/include/cudf/strings/detail/utf8.hpp b/cpp/include/cudf/strings/detail/utf8.hpp index df8e2885782..e04572535de 100644 --- a/cpp/include/cudf/strings/detail/utf8.hpp +++ b/cpp/include/cudf/strings/detail/utf8.hpp @@ -155,18 +155,18 @@ constexpr inline size_type from_char_utf8(char_utf8 character, char* str) constexpr uint32_t utf8_to_codepoint(cudf::char_utf8 utf8_char) { uint32_t unchr = 0; - if (utf8_char < 0x0000'0080) // single-byte pass thru + if (utf8_char < 0x0000'0080) // single-byte pass thru unchr = utf8_char; - else if (utf8_char < 0x0000'E000) // two bytes + else if (utf8_char < 0x0000'E000) // two bytes { - unchr = (utf8_char & 0x1F00) >> 2; // shift and - unchr |= (utf8_char & 0x003F); // unmask - } else if (utf8_char < 0x00F0'0000) // three bytes + unchr = (utf8_char & 0x1F00) >> 2; // shift and + unchr |= (utf8_char & 0x003F); // unmask + } else if (utf8_char < 0x00F0'0000) // three bytes { - unchr = (utf8_char & 0x0F'0000) >> 4; // get upper 4 bits - unchr |= (utf8_char & 0x00'3F00) >> 2; // shift and - unchr |= (utf8_char & 0x00'003F); // unmask - } else if (utf8_char <= 0xF800'0000u) // four bytes + unchr = (utf8_char & 0x0F'0000) >> 4; // get upper 4 bits + unchr |= (utf8_char & 0x00'3F00) >> 2; // shift and + unchr |= (utf8_char & 0x00'003F); // unmask + } else if (utf8_char <= 0xF800'0000u) // four bytes { unchr = (utf8_char & 0x0300'0000) >> 6; // upper 3 bits unchr |= (utf8_char & 0x003F'0000) >> 4; // next 6 bits @@ -185,20 +185,20 @@ constexpr uint32_t utf8_to_codepoint(cudf::char_utf8 utf8_char) constexpr cudf::char_utf8 codepoint_to_utf8(uint32_t unchr) { cudf::char_utf8 utf8 = 0; - if (unchr < 0x0000'0080) // single byte utf8 + if (unchr < 0x0000'0080) // single byte utf8 utf8 = unchr; - else if (unchr < 0x0000'0800) // double byte utf8 + else if (unchr < 0x0000'0800) // double byte utf8 { - utf8 = (unchr << 2) & 0x1F00; // shift bits for - utf8 |= (unchr & 0x3F); // utf8 encoding + utf8 = (unchr << 2) & 0x1F00; // shift bits for + utf8 |= (unchr & 0x3F); // utf8 encoding utf8 |= 0x0000'C080; - } else if (unchr < 0x0001'0000) // triple byte utf8 + } else if (unchr < 0x0001'0000) // triple byte utf8 { - utf8 = (unchr << 4) & 0x0F'0000; // upper 4 bits - utf8 |= (unchr << 2) & 0x00'3F00; // next 6 bits - utf8 |= (unchr & 0x3F); // last 6 bits + utf8 = (unchr << 4) & 0x0F'0000; // upper 4 bits + utf8 |= (unchr << 2) & 0x00'3F00; // next 6 bits + utf8 |= (unchr & 0x3F); // last 6 bits utf8 |= 0x00E0'8080; - } else if (unchr < 0x0011'0000) // quadruple byte utf8 + } else if (unchr < 0x0011'0000) // quadruple byte utf8 { utf8 = (unchr << 6) & 0x0700'0000; // upper 3 bits utf8 |= (unchr << 4) & 0x003F'0000; // next 6 bits diff --git a/cpp/include/cudf/table/row_operators.cuh b/cpp/include/cudf/table/row_operators.cuh index 599a85c8a54..4806f96c934 100644 --- a/cpp/include/cudf/table/row_operators.cuh +++ b/cpp/include/cudf/table/row_operators.cuh @@ -105,9 +105,9 @@ inline __device__ auto null_compare(bool lhs_is_null, bool rhs_is_null, 
null_ord { if (lhs_is_null and rhs_is_null) { // null (dictionary_wrapper const& lhs, using dictionary32 = dictionary_wrapper; ///< 32-bit integer indexed dictionary wrapper -/** @} */ // end of group +/** @} */ // end of group } // namespace cudf diff --git a/cpp/include/cudf_test/base_fixture.hpp b/cpp/include/cudf_test/base_fixture.hpp index b622d7c6b78..06aabbe4e9c 100644 --- a/cpp/include/cudf_test/base_fixture.hpp +++ b/cpp/include/cudf_test/base_fixture.hpp @@ -331,9 +331,9 @@ inline auto parse_cudf_test_opts(int argc, char** argv) cxxopts::Options options(argv[0], " - cuDF tests command line options"); char const* env_rmm_mode = std::getenv("GTEST_CUDF_RMM_MODE"); // Overridden by CLI options char const* env_stream_mode = - std::getenv("GTEST_CUDF_STREAM_MODE"); // Overridden by CLI options + std::getenv("GTEST_CUDF_STREAM_MODE"); // Overridden by CLI options char const* env_stream_error_mode = - std::getenv("GTEST_CUDF_STREAM_ERROR_MODE"); // Overridden by CLI options + std::getenv("GTEST_CUDF_STREAM_ERROR_MODE"); // Overridden by CLI options auto default_rmm_mode = env_rmm_mode ? env_rmm_mode : "pool"; auto default_stream_mode = env_stream_mode ? env_stream_mode : "default"; auto default_stream_error_mode = env_stream_error_mode ? env_stream_error_mode : "error"; diff --git a/cpp/include/nvtext/subword_tokenize.hpp b/cpp/include/nvtext/subword_tokenize.hpp index ac75f5e9147..72a899d70b4 100644 --- a/cpp/include/nvtext/subword_tokenize.hpp +++ b/cpp/include/nvtext/subword_tokenize.hpp @@ -44,7 +44,7 @@ struct hashed_vocabulary { std::unique_ptr bin_offsets; ///< uint16 column, containing the start index of each ///< bin in the flattened hash table std::unique_ptr - cp_metadata; ///< uint32 column, The code point metadata table to use for normalization + cp_metadata; ///< uint32 column, The code point metadata table to use for normalization std::unique_ptr aux_cp_table; ///< uint64 column, The auxiliary code point table to use for normalization }; diff --git a/cpp/scripts/run-clang-tidy.py b/cpp/scripts/run-clang-tidy.py index a617a4c0df7..e5e57dbf562 100644 --- a/cpp/scripts/run-clang-tidy.py +++ b/cpp/scripts/run-clang-tidy.py @@ -22,7 +22,7 @@ import shutil -EXPECTED_VERSION = "16.0.1" +EXPECTED_VERSION = "16.0.6" VERSION_REGEX = re.compile(r" LLVM version ([0-9.]+)") GPU_ARCH_REGEX = re.compile(r"sm_(\d+)") SPACES = re.compile(r"\s+") diff --git a/cpp/src/copying/contiguous_split.cu b/cpp/src/copying/contiguous_split.cu index e1a55ec5419..5ea56a05dcb 100644 --- a/cpp/src/copying/contiguous_split.cu +++ b/cpp/src/copying/contiguous_split.cu @@ -114,8 +114,8 @@ struct dst_buf_info { int bit_shift; // # of bits to shift right by (for validity buffers) size_type valid_count; // validity count for this block of work - int src_buf_index; // source buffer index - int dst_buf_index; // destination buffer index + int src_buf_index; // source buffer index + int dst_buf_index; // destination buffer index }; /** @@ -1384,7 +1384,7 @@ struct chunk_iteration_state { std::size_t starting_batch; ///< Starting batch index for the current iteration std::vector const h_num_buffs_per_iteration; ///< The count of batches per iteration std::vector const - h_size_of_buffs_per_iteration; ///< The size in bytes per iteration + h_size_of_buffs_per_iteration; ///< The size in bytes per iteration }; std::unique_ptr chunk_iteration_state::create( @@ -1989,7 +1989,7 @@ struct contiguous_split_state { // This can be 1 if `contiguous_split` is just packing and not splitting std::size_t const num_partitions; 
///< The number of partitions to produce - size_type const num_src_bufs; ///< Number of source buffers including children + size_type const num_src_bufs; ///< Number of source buffers including children std::size_t const num_bufs; ///< Number of source buffers including children * number of splits diff --git a/cpp/src/groupby/sort/functors.hpp b/cpp/src/groupby/sort/functors.hpp index c378ac99727..be36956b929 100644 --- a/cpp/src/groupby/sort/functors.hpp +++ b/cpp/src/groupby/sort/functors.hpp @@ -94,12 +94,12 @@ struct store_result_functor { }; protected: - sort::sort_groupby_helper& helper; ///< Sort helper - cudf::detail::result_cache& cache; ///< cache of results to store into - column_view const& values; ///< Column of values to group and aggregate + sort::sort_groupby_helper& helper; ///< Sort helper + cudf::detail::result_cache& cache; ///< cache of results to store into + column_view const& values; ///< Column of values to group and aggregate - rmm::cuda_stream_view stream; ///< CUDA stream on which to execute kernels - rmm::mr::device_memory_resource* mr; ///< Memory resource to allocate space for results + rmm::cuda_stream_view stream; ///< CUDA stream on which to execute kernels + rmm::mr::device_memory_resource* mr; ///< Memory resource to allocate space for results sorted keys_are_sorted; ///< Whether the keys are sorted std::unique_ptr sorted_values; ///< Memoised grouped and sorted values diff --git a/cpp/src/io/avro/avro_gpu.cu b/cpp/src/io/avro/avro_gpu.cu index 2c634d9b590..365f6d6875c 100644 --- a/cpp/src/io/avro/avro_gpu.cu +++ b/cpp/src/io/avro/avro_gpu.cu @@ -303,7 +303,7 @@ avro_decode_row(schemadesc_s const* schema, // If within an array, check if we reached the last item if (array_repeat_count != 0 && array_children <= 0 && cur < end) { if (!--array_repeat_count) { - i = array_start; // Restart at the array parent + i = array_start; // Restart at the array parent } else { i = array_start + 1; // Restart after the array parent array_children = schema[array_start].count; diff --git a/cpp/src/io/comp/cpu_unbz2.cpp b/cpp/src/io/comp/cpu_unbz2.cpp index 7159ff30d7c..a116335b254 100644 --- a/cpp/src/io/comp/cpu_unbz2.cpp +++ b/cpp/src/io/comp/cpu_unbz2.cpp @@ -216,7 +216,7 @@ int32_t bz2_decompress_block(unbz_state_s* s) s->currBlockNo++; - skipbits(s, 32); // block CRC + skipbits(s, 32); // block CRC if (getbits(s, 1)) return BZ_DATA_ERROR; // blockRandomized not supported (old bzip versions) diff --git a/cpp/src/io/comp/debrotli.cu b/cpp/src/io/comp/debrotli.cu index 542ca031b7c..8bafd054bdb 100644 --- a/cpp/src/io/comp/debrotli.cu +++ b/cpp/src/io/comp/debrotli.cu @@ -121,7 +121,7 @@ __inline__ __device__ int brotli_context(int p1, int p2, int lut) struct huff_scratch_s { uint16_t code_length_histo[16]; uint8_t code_length_code_lengths[brotli_code_length_codes]; - int8_t offset[6]; // offsets in sorted table for each length + int8_t offset[6]; // offsets in sorted table for each length uint16_t lenvlctab[32]; uint16_t sorted[brotli_code_length_codes]; // symbols sorted by code length int16_t next_symbol[32]; @@ -1298,7 +1298,7 @@ static __device__ void InverseMoveToFrontTransform(debrotli_state_s* s, uint8_t* // Reinitialize elements that could have been changed. uint32_t i = 1; uint32_t upper_bound = s->mtf_upper_bound; - uint32_t* mtf = &s->mtf[1]; // Make mtf[-1] addressable. + uint32_t* mtf = &s->mtf[1]; // Make mtf[-1] addressable. 
auto* mtf_u8 = reinterpret_cast(mtf); uint32_t pattern = 0x0302'0100; // Little-endian diff --git a/cpp/src/io/comp/gpuinflate.cu b/cpp/src/io/comp/gpuinflate.cu index 42c4fbe7bea..8993815e560 100644 --- a/cpp/src/io/comp/gpuinflate.cu +++ b/cpp/src/io/comp/gpuinflate.cu @@ -124,11 +124,11 @@ struct inflate_state_s { uint8_t* outbase; ///< start of output buffer uint8_t* outend; ///< end of output buffer // Input state - uint8_t const* cur; ///< input buffer - uint8_t const* end; ///< end of input buffer + uint8_t const* cur; ///< input buffer + uint8_t const* end; ///< end of input buffer - uint2 bitbuf; ///< bit buffer (64-bit) - uint32_t bitpos; ///< position in bit buffer + uint2 bitbuf; ///< bit buffer (64-bit) + uint32_t bitpos; ///< position in bit buffer int32_t err; ///< Error status int btype; ///< current block type @@ -295,7 +295,7 @@ __device__ int construct( return 0; // complete, but decode() will fail // check for an over-subscribed or incomplete set of lengths - left = 1; // one possible code of zero length + left = 1; // one possible code of zero length for (len = 1; len <= max_bits; len++) { left <<= 1; // one more bit, double codes left left -= counts[len]; // deduct count from possible codes @@ -349,8 +349,8 @@ __device__ int init_dynamic(inflate_state_s* s) index = 0; while (index < nlen + ndist) { int symbol = decode(s, s->lencnt, s->lensym); - if (symbol < 0) return symbol; // invalid symbol - if (symbol < 16) // length in 0..15 + if (symbol < 0) return symbol; // invalid symbol + if (symbol < 16) // length in 0..15 lengths[index++] = symbol; else { // repeat instruction int len = 0; // last length to repeat, assume repeating zeros @@ -358,9 +358,9 @@ __device__ int init_dynamic(inflate_state_s* s) if (index == 0) return -5; // no last length! len = lengths[index - 1]; // last length symbol = 3 + getbits(s, 2); - } else if (symbol == 17) // repeat zero 3..10 times + } else if (symbol == 17) // repeat zero 3..10 times symbol = 3 + getbits(s, 3); - else // == 18, repeat zero 11..138 times + else // == 18, repeat zero 11..138 times symbol = 11 + getbits(s, 7); if (index + symbol > nlen + ndist) return -6; // too many lengths! 
while (symbol--) // repeat last or zero symbol times diff --git a/cpp/src/io/comp/uncomp.cpp b/cpp/src/io/comp/uncomp.cpp index 017fd8abb47..0d2d21333bb 100644 --- a/cpp/src/io/comp/uncomp.cpp +++ b/cpp/src/io/comp/uncomp.cpp @@ -28,7 +28,7 @@ #include // memset -#include // uncompress +#include // uncompress using cudf::host_span; @@ -47,7 +47,7 @@ struct gz_file_header_s { uint8_t os; // OS id }; -struct zip_eocd_s // end of central directory +struct zip_eocd_s // end of central directory { uint32_t sig; // 0x0605'4b50 uint16_t disk_id; // number of this disk @@ -59,7 +59,7 @@ struct zip_eocd_s // end of central directory // number uint16_t comment_len; // comment length (excluded from struct) }; -struct zip64_eocdl // end of central dir locator +struct zip64_eocdl // end of central dir locator { uint32_t sig; // 0x0706'4b50 uint32_t disk_start; // number of the disk with the start of the zip64 end of central directory @@ -67,7 +67,7 @@ struct zip64_eocdl // end of central dir locator uint32_t num_disks; // total number of disks }; -struct zip_cdfh_s // central directory file header +struct zip_cdfh_s // central directory file header { uint32_t sig; // 0x0201'4b50 uint16_t ver; // version made by @@ -111,7 +111,7 @@ struct bz2_file_header_s { struct gz_archive_s { gz_file_header_s const* fhdr; - uint16_t hcrc16; // header crc16 if present + uint16_t hcrc16; // header crc16 if present uint16_t xlen; uint8_t const* fxtra; // xlen bytes (optional) uint8_t const* fname; // zero-terminated original filename if present diff --git a/cpp/src/io/comp/unsnap.cu b/cpp/src/io/comp/unsnap.cu index a7a1cfd3f9e..c699502317f 100644 --- a/cpp/src/io/comp/unsnap.cu +++ b/cpp/src/io/comp/unsnap.cu @@ -45,7 +45,7 @@ void __device__ busy_wait(size_t cycles) struct unsnap_batch_s { int32_t len; // 1..64 = Number of bytes uint32_t - offset; // copy distance if greater than zero or negative of literal offset in byte stream + offset; // copy distance if greater than zero or negative of literal offset in byte stream }; /** diff --git a/cpp/src/io/json/json_column.cu b/cpp/src/io/json/json_column.cu index bdad16bd9f1..cabf904f020 100644 --- a/cpp/src/io/json/json_column.cu +++ b/cpp/src/io/json/json_column.cu @@ -169,7 +169,7 @@ reduce_to_column_tree(tree_meta_t& tree, }); // 4. 
unique_copy parent_node_ids, ranges - rmm::device_uvector column_levels(0, stream); // not required + rmm::device_uvector column_levels(0, stream); // not required rmm::device_uvector parent_col_ids(num_columns, stream); rmm::device_uvector col_range_begin(num_columns, stream); // Field names rmm::device_uvector col_range_end(num_columns, stream); diff --git a/cpp/src/io/json/nested_json_gpu.cu b/cpp/src/io/json/nested_json_gpu.cu index b691eaa8caf..0b49f97597d 100644 --- a/cpp/src/io/json/nested_json_gpu.cu +++ b/cpp/src/io/json/nested_json_gpu.cu @@ -762,18 +762,18 @@ auto get_translation_table(bool include_line_delimiter) nl_tokens({}), // LINE_BREAK {ValueBegin}}}; // OTHER pda_tlt[static_cast(pda_state_t::PD_BOA)] = { - { /*ROOT*/ - {ErrorBegin}, // OPENING_BRACE - {ErrorBegin}, // OPENING_BRACKET - {ErrorBegin}, // CLOSING_BRACE - {ErrorBegin}, // CLOSING_BRACKET - {ErrorBegin}, // QUOTE - {ErrorBegin}, // ESCAPE - {ErrorBegin}, // COMMA - {ErrorBegin}, // COLON - {ErrorBegin}, // WHITE_SPACE - nl_tokens({ErrorBegin}), // LINE_BREAK - {ErrorBegin}, // OTHER + { /*ROOT*/ + {ErrorBegin}, // OPENING_BRACE + {ErrorBegin}, // OPENING_BRACKET + {ErrorBegin}, // CLOSING_BRACE + {ErrorBegin}, // CLOSING_BRACKET + {ErrorBegin}, // QUOTE + {ErrorBegin}, // ESCAPE + {ErrorBegin}, // COMMA + {ErrorBegin}, // COLON + {ErrorBegin}, // WHITE_SPACE + nl_tokens({ErrorBegin}), // LINE_BREAK + {ErrorBegin}, // OTHER /*LIST*/ {StructBegin}, // OPENING_BRACE {ListBegin}, // OPENING_BRACKET @@ -799,18 +799,18 @@ auto get_translation_table(bool include_line_delimiter) nl_tokens({}), // LINE_BREAK {ErrorBegin}}}; // OTHER pda_tlt[static_cast(pda_state_t::PD_LON)] = { - { /*ROOT*/ - {ErrorBegin}, // OPENING_BRACE - {ErrorBegin}, // OPENING_BRACKET - {ErrorBegin}, // CLOSING_BRACE - {ErrorBegin}, // CLOSING_BRACKET - {ErrorBegin}, // QUOTE - {ErrorBegin}, // ESCAPE - {ErrorBegin}, // COMMA - {ErrorBegin}, // COLON - {ValueEnd}, // WHITE_SPACE - nl_tokens({ValueEnd}), // LINE_BREAK - {}, // OTHER + { /*ROOT*/ + {ErrorBegin}, // OPENING_BRACE + {ErrorBegin}, // OPENING_BRACKET + {ErrorBegin}, // CLOSING_BRACE + {ErrorBegin}, // CLOSING_BRACKET + {ErrorBegin}, // QUOTE + {ErrorBegin}, // ESCAPE + {ErrorBegin}, // COMMA + {ErrorBegin}, // COLON + {ValueEnd}, // WHITE_SPACE + nl_tokens({ValueEnd}), // LINE_BREAK + {}, // OTHER /*LIST*/ {ErrorBegin}, // OPENING_BRACE {ErrorBegin}, // OPENING_BRACKET @@ -824,17 +824,17 @@ auto get_translation_table(bool include_line_delimiter) nl_tokens({ValueEnd}), // LINE_BREAK {}, // OTHER /*STRUCT*/ - {ErrorBegin}, // OPENING_BRACE - {ErrorBegin}, // OPENING_BRACKET - {ValueEnd, StructMemberEnd, StructEnd}, // CLOSING_BRACE - {ErrorBegin}, // CLOSING_BRACKET - {ErrorBegin}, // QUOTE - {ErrorBegin}, // ESCAPE - {ValueEnd, StructMemberEnd}, // COMMA - {ErrorBegin}, // COLON - {ValueEnd}, // WHITE_SPACE - nl_tokens({ValueEnd}), // LINE_BREAK - {}}}; // OTHER + {ErrorBegin}, // OPENING_BRACE + {ErrorBegin}, // OPENING_BRACKET + {ValueEnd, StructMemberEnd, StructEnd}, // CLOSING_BRACE + {ErrorBegin}, // CLOSING_BRACKET + {ErrorBegin}, // QUOTE + {ErrorBegin}, // ESCAPE + {ValueEnd, StructMemberEnd}, // COMMA + {ErrorBegin}, // COLON + {ValueEnd}, // WHITE_SPACE + nl_tokens({ValueEnd}), // LINE_BREAK + {}}}; // OTHER pda_tlt[static_cast(pda_state_t::PD_STR)] = {{ /*ROOT*/ {}, // OPENING_BRACE @@ -974,17 +974,17 @@ auto get_translation_table(bool include_line_delimiter) nl_tokens({ErrorBegin}), // LINE_BREAK {ErrorBegin}, // OTHER /*STRUCT*/ - {ErrorBegin}, // OPENING_BRACE - 
{ErrorBegin}, // OPENING_BRACKET - {StructEnd}, // CLOSING_BRACE - {ErrorBegin}, // CLOSING_BRACKET - {StructMemberBegin, FieldNameBegin}, // QUOTE - {ErrorBegin}, // ESCAPE - {ErrorBegin}, // COMMA - {ErrorBegin}, // COLON - {}, // WHITE_SPACE - nl_tokens({}), // LINE_BREAK - {ErrorBegin}}}; // OTHER + {ErrorBegin}, // OPENING_BRACE + {ErrorBegin}, // OPENING_BRACKET + {StructEnd}, // CLOSING_BRACE + {ErrorBegin}, // CLOSING_BRACKET + {StructMemberBegin, FieldNameBegin}, // QUOTE + {ErrorBegin}, // ESCAPE + {ErrorBegin}, // COMMA + {ErrorBegin}, // COLON + {}, // WHITE_SPACE + nl_tokens({}), // LINE_BREAK + {ErrorBegin}}}; // OTHER pda_tlt[static_cast(pda_state_t::PD_FLN)] = {{ /*ROOT*/ {ErrorBegin}, // OPENING_BRACE @@ -1011,17 +1011,17 @@ auto get_translation_table(bool include_line_delimiter) nl_tokens({ErrorBegin}), // LINE_BREAK {ErrorBegin}, // OTHER /*STRUCT*/ - {}, // OPENING_BRACE - {}, // OPENING_BRACKET - {}, // CLOSING_BRACE - {}, // CLOSING_BRACKET - {FieldNameEnd}, // QUOTE - {}, // ESCAPE - {}, // COMMA - {}, // COLON - {}, // WHITE_SPACE - nl_tokens({}), // LINE_BREAK - {}}}; // OTHER + {}, // OPENING_BRACE + {}, // OPENING_BRACKET + {}, // CLOSING_BRACE + {}, // CLOSING_BRACKET + {FieldNameEnd}, // QUOTE + {}, // ESCAPE + {}, // COMMA + {}, // COLON + {}, // WHITE_SPACE + nl_tokens({}), // LINE_BREAK + {}}}; // OTHER pda_tlt[static_cast(pda_state_t::PD_FNE)] = {{ /*ROOT*/ {ErrorBegin}, // OPENING_BRACE @@ -1048,17 +1048,17 @@ auto get_translation_table(bool include_line_delimiter) nl_tokens({ErrorBegin}), // LINE_BREAK {ErrorBegin}, // OTHER /*STRUCT*/ - {}, // OPENING_BRACE - {}, // OPENING_BRACKET - {}, // CLOSING_BRACE - {}, // CLOSING_BRACKET - {}, // QUOTE - {}, // ESCAPE - {}, // COMMA - {}, // COLON - {}, // WHITE_SPACE - nl_tokens({}), // LINE_BREAK - {}}}; // OTHER + {}, // OPENING_BRACE + {}, // OPENING_BRACKET + {}, // CLOSING_BRACE + {}, // CLOSING_BRACKET + {}, // QUOTE + {}, // ESCAPE + {}, // COMMA + {}, // COLON + {}, // WHITE_SPACE + nl_tokens({}), // LINE_BREAK + {}}}; // OTHER pda_tlt[static_cast(pda_state_t::PD_PFN)] = {{ /*ROOT*/ {ErrorBegin}, // OPENING_BRACE @@ -1097,18 +1097,18 @@ auto get_translation_table(bool include_line_delimiter) nl_tokens({}), // LINE_BREAK {ErrorBegin}}}; // OTHER - pda_tlt[static_cast(pda_state_t::PD_ERR)] = {{ /*ROOT*/ - {}, // OPENING_BRACE - {}, // OPENING_BRACKET - {}, // CLOSING_BRACE - {}, // CLOSING_BRACKET - {}, // QUOTE - {}, // ESCAPE - {}, // COMMA - {}, // COLON - {}, // WHITE_SPACE - nl_tokens({}), // LINE_BREAK - {}, // OTHER + pda_tlt[static_cast(pda_state_t::PD_ERR)] = {{ /*ROOT*/ + {}, // OPENING_BRACE + {}, // OPENING_BRACKET + {}, // CLOSING_BRACE + {}, // CLOSING_BRACKET + {}, // QUOTE + {}, // ESCAPE + {}, // COMMA + {}, // COLON + {}, // WHITE_SPACE + nl_tokens({}), // LINE_BREAK + {}, // OTHER /*LIST*/ {}, // OPENING_BRACE {}, // OPENING_BRACKET diff --git a/cpp/src/io/orc/orc_gpu.hpp b/cpp/src/io/orc/orc_gpu.hpp index 681cc0fb9d2..9b8df50a22a 100644 --- a/cpp/src/io/orc/orc_gpu.hpp +++ b/cpp/src/io/orc/orc_gpu.hpp @@ -157,7 +157,7 @@ struct EncChunk { uint8_t dtype_len; // data type length int32_t scale; // scale for decimals or timestamps - uint32_t* dict_index; // dictionary index from row index + uint32_t* dict_index; // dictionary index from row index uint32_t* decimal_offsets; orc_column_device_view const* column; }; diff --git a/cpp/src/io/orc/stripe_data.cu b/cpp/src/io/orc/stripe_data.cu index b66ca827119..3edcd3d83b2 100644 --- a/cpp/src/io/orc/stripe_data.cu +++ 
b/cpp/src/io/orc/stripe_data.cu @@ -367,14 +367,14 @@ inline __device__ uint32_t varint_length(volatile orc_bytestream_s* bs, int pos) if (zbit) { return 5 + (zbit >> 3); // up to 9x7 bits } else if ((sizeof(T) <= 8) || (bytestream_readbyte(bs, pos + 9) <= 0x7f)) { - return 10; // up to 70 bits + return 10; // up to 70 bits } else { uint64_t next64 = bytestream_readu64(bs, pos + 10); zbit = __ffsll((~next64) & 0x8080'8080'8080'8080ull); if (zbit) { return 10 + (zbit >> 3); // Up to 18x7 bits (126) } else { - return 19; // Up to 19x7 bits (133) + return 19; // Up to 19x7 bits (133) } } } diff --git a/cpp/src/io/parquet/compact_protocol_reader.cpp b/cpp/src/io/parquet/compact_protocol_reader.cpp index 92fcd151925..ae11af92f78 100644 --- a/cpp/src/io/parquet/compact_protocol_reader.cpp +++ b/cpp/src/io/parquet/compact_protocol_reader.cpp @@ -168,7 +168,7 @@ bool CompactProtocolReader::read(LogicalType* l) ParquetFieldUnion(2, l->isset.MAP, l->MAP), ParquetFieldUnion(3, l->isset.LIST, l->LIST), ParquetFieldUnion(4, l->isset.ENUM, l->ENUM), - ParquetFieldUnion(5, l->isset.DECIMAL, l->DECIMAL), // read the struct + ParquetFieldUnion(5, l->isset.DECIMAL, l->DECIMAL), // read the struct ParquetFieldUnion(6, l->isset.DATE, l->DATE), ParquetFieldUnion(7, l->isset.TIME, l->TIME), // read the struct ParquetFieldUnion(8, l->isset.TIMESTAMP, l->TIMESTAMP), // read the struct diff --git a/cpp/src/io/parquet/compact_protocol_writer.cpp b/cpp/src/io/parquet/compact_protocol_writer.cpp index b2a89129645..b2c0c97c52d 100644 --- a/cpp/src/io/parquet/compact_protocol_writer.cpp +++ b/cpp/src/io/parquet/compact_protocol_writer.cpp @@ -315,7 +315,7 @@ inline void CompactProtocolFieldWriter::field_struct(int field, T const& val) if constexpr (not std::is_empty_v) { writer.write(val); // write the struct if it's not empty } else { - put_byte(0); // otherwise, add a stop field + put_byte(0); // otherwise, add a stop field } current_field_value = field; } diff --git a/cpp/src/io/parquet/delta_binary.cuh b/cpp/src/io/parquet/delta_binary.cuh index 4fc8b9cfb8e..2382e4aafdf 100644 --- a/cpp/src/io/parquet/delta_binary.cuh +++ b/cpp/src/io/parquet/delta_binary.cuh @@ -90,16 +90,16 @@ inline __device__ zigzag128_t get_zz128(uint8_t const*& cur, uint8_t const* end) } struct delta_binary_decoder { - uint8_t const* block_start; // start of data, but updated as data is read - uint8_t const* block_end; // end of data - uleb128_t block_size; // usually 128, must be multiple of 128 - uleb128_t mini_block_count; // usually 4, chosen such that block_size/mini_block_count is a - // multiple of 32 - uleb128_t value_count; // total values encoded in the block - zigzag128_t last_value; // last value decoded, initialized to first_value from header - - uint32_t values_per_mb; // block_size / mini_block_count, must be multiple of 32 - uint32_t current_value_idx; // current value index, initialized to 0 at start of block + uint8_t const* block_start; // start of data, but updated as data is read + uint8_t const* block_end; // end of data + uleb128_t block_size; // usually 128, must be multiple of 128 + uleb128_t mini_block_count; // usually 4, chosen such that block_size/mini_block_count is a + // multiple of 32 + uleb128_t value_count; // total values encoded in the block + zigzag128_t last_value; // last value decoded, initialized to first_value from header + + uint32_t values_per_mb; // block_size / mini_block_count, must be multiple of 32 + uint32_t current_value_idx; // current value index, initialized to 0 at start of block 
zigzag128_t cur_min_delta; // min delta for the block uint32_t cur_mb; // index of the current mini-block within the block diff --git a/cpp/src/io/parquet/page_delta_decode.cu b/cpp/src/io/parquet/page_delta_decode.cu index e79a479388f..35f33a761be 100644 --- a/cpp/src/io/parquet/page_delta_decode.cu +++ b/cpp/src/io/parquet/page_delta_decode.cu @@ -85,7 +85,7 @@ __global__ void __launch_bounds__(96) gpuDecodeDeltaBinary( if (t < 2 * warp_size) { // warp0..1 target_pos = min(src_pos + 2 * batch_size, s->nz_count + batch_size); - } else { // warp2 + } else { // warp2 target_pos = min(s->nz_count, src_pos + batch_size); } __syncthreads(); diff --git a/cpp/src/io/parquet/parquet.hpp b/cpp/src/io/parquet/parquet.hpp index a729f28d672..f7318bb9935 100644 --- a/cpp/src/io/parquet/parquet.hpp +++ b/cpp/src/io/parquet/parquet.hpp @@ -365,8 +365,8 @@ struct ColumnIndex { std::vector> min_values; // lower bound for values in each page std::vector> max_values; // upper bound for values in each page BoundaryOrder boundary_order = - BoundaryOrder::UNORDERED; // Indicates if min and max values are ordered - std::vector null_counts; // Optional count of null values per page + BoundaryOrder::UNORDERED; // Indicates if min and max values are ordered + std::vector null_counts; // Optional count of null values per page }; // bit space we are reserving in column_buffer::user_data diff --git a/cpp/src/io/parquet/parquet_gpu.hpp b/cpp/src/io/parquet/parquet_gpu.hpp index e82b6abc13d..a3cc37dee4f 100644 --- a/cpp/src/io/parquet/parquet_gpu.hpp +++ b/cpp/src/io/parquet/parquet_gpu.hpp @@ -299,7 +299,7 @@ struct ColumnChunkDesc { int8_t converted_type; // converted type enum LogicalType logical_type; // logical type int8_t decimal_precision; // Decimal precision - int32_t ts_clock_rate; // output timestamp clock frequency (0=default, 1000=ms, 1000000000=ns) + int32_t ts_clock_rate; // output timestamp clock frequency (0=default, 1000=ms, 1000000000=ns) int32_t src_col_index; // my input column index int32_t src_col_schema; // my schema index in the file @@ -396,16 +396,16 @@ constexpr uint32_t encoding_to_mask(Encoding encoding) struct EncColumnChunk { parquet_column_device_view const* col_desc; //!< Column description size_type col_desc_id; - PageFragment* fragments; //!< First fragment in chunk - uint8_t* uncompressed_bfr; //!< Uncompressed page data - uint8_t* compressed_bfr; //!< Compressed page data - statistics_chunk const* stats; //!< Fragment statistics - uint32_t bfr_size; //!< Uncompressed buffer size - uint32_t compressed_size; //!< Compressed buffer size - uint32_t max_page_data_size; //!< Max data size (excluding header) of any page in this chunk - uint32_t page_headers_size; //!< Sum of size of all page headers - size_type start_row; //!< First row of chunk - uint32_t num_rows; //!< Number of rows in chunk + PageFragment* fragments; //!< First fragment in chunk + uint8_t* uncompressed_bfr; //!< Uncompressed page data + uint8_t* compressed_bfr; //!< Compressed page data + statistics_chunk const* stats; //!< Fragment statistics + uint32_t bfr_size; //!< Uncompressed buffer size + uint32_t compressed_size; //!< Compressed buffer size + uint32_t max_page_data_size; //!< Max data size (excluding header) of any page in this chunk + uint32_t page_headers_size; //!< Sum of size of all page headers + size_type start_row; //!< First row of chunk + uint32_t num_rows; //!< Number of rows in chunk size_type num_values; //!< Number of values in chunk. 
Different from num_rows for nested types uint32_t first_fragment; //!< First fragment of chunk EncPage* pages; //!< Ptr to pages that belong to this chunk diff --git a/cpp/src/io/parquet/reader_impl_preprocess.cu b/cpp/src/io/parquet/reader_impl_preprocess.cu index bde73c3dd96..a2db0de26bb 100644 --- a/cpp/src/io/parquet/reader_impl_preprocess.cu +++ b/cpp/src/io/parquet/reader_impl_preprocess.cu @@ -1673,7 +1673,7 @@ void reader::impl::preprocess_pages(size_t skip_rows, // - we will be doing a chunked read gpu::ComputePageSizes(pages, chunks, - 0, // 0-max size_t. process all possible rows + 0, // 0-max size_t. process all possible rows std::numeric_limits::max(), true, // compute num_rows chunk_read_limit > 0, // compute string sizes diff --git a/cpp/src/join/join.cu b/cpp/src/join/join.cu index 8210f3114d6..ae025b1a213 100644 --- a/cpp/src/join/join.cu +++ b/cpp/src/join/join.cu @@ -73,7 +73,7 @@ left_join(table_view const& left_input, // Make sure any dictionary columns have matched key sets. // This will return any new dictionary columns created as well as updated table_views. auto matched = cudf::dictionary::detail::match_dictionaries( - {left_input, right_input}, // these should match + {left_input, right_input}, // these should match stream, rmm::mr::get_current_device_resource()); // temporary objects returned // now rebuild the table views with the updated ones @@ -98,7 +98,7 @@ full_join(table_view const& left_input, // Make sure any dictionary columns have matched key sets. // This will return any new dictionary columns created as well as updated table_views. auto matched = cudf::dictionary::detail::match_dictionaries( - {left_input, right_input}, // these should match + {left_input, right_input}, // these should match stream, rmm::mr::get_current_device_resource()); // temporary objects returned // now rebuild the table views with the updated ones diff --git a/cpp/src/quantiles/tdigest/tdigest_aggregation.cu b/cpp/src/quantiles/tdigest/tdigest_aggregation.cu index 2ce55e10fb1..9e8b75ae3b6 100644 --- a/cpp/src/quantiles/tdigest/tdigest_aggregation.cu +++ b/cpp/src/quantiles/tdigest/tdigest_aggregation.cu @@ -459,7 +459,7 @@ __global__ void generate_cluster_limits_kernel(int delta, int adjusted_w_index = nearest_w_index; if ((last_inserted_index < 0) || // if we haven't inserted anything yet (nearest_w_index == - last_inserted_index)) { // if we land in the same bucket as the previous cap + last_inserted_index)) { // if we land in the same bucket as the previous cap // force the value into this bucket adjusted_w_index = (last_inserted_index == group_size - 1) diff --git a/cpp/src/rolling/detail/rolling_collect_list.cuh b/cpp/src/rolling/detail/rolling_collect_list.cuh index 9f74a961e12..39d15ed716f 100644 --- a/cpp/src/rolling/detail/rolling_collect_list.cuh +++ b/cpp/src/rolling/detail/rolling_collect_list.cuh @@ -116,7 +116,7 @@ std::unique_ptr create_collect_gather_map(column_view const& child_offse thrust::make_counting_iterator(per_row_mapping.size()), gather_map->mutable_view().template begin(), [d_offsets = - child_offsets.template begin(), // E.g. [0, 2, 5, 8, 11, 13] + child_offsets.template begin(), // E.g. [0, 2, 5, 8, 11, 13] d_groups = per_row_mapping.template begin(), // E.g. 
[0,0, 1,1,1, 2,2,2, 3,3,3, 4,4] d_prev = preceding_iter] __device__(auto i) { diff --git a/cpp/src/strings/char_types/char_types.cu b/cpp/src/strings/char_types/char_types.cu index b87fb80fcc2..0c0ad0ad29e 100644 --- a/cpp/src/strings/char_types/char_types.cu +++ b/cpp/src/strings/char_types/char_types.cu @@ -139,9 +139,9 @@ struct filter_chars_fn { { auto const code_point = detail::utf8_to_codepoint(ch); auto const flag = code_point <= 0x00'FFFF ? d_flags[code_point] : 0; - if (flag == 0) // all types pass unless specifically identified + if (flag == 0) // all types pass unless specifically identified return (types_to_remove == ALL_TYPES); - if (types_to_keep == ALL_TYPES) // filter case + if (types_to_keep == ALL_TYPES) // filter case return (types_to_remove & flag) != 0; return (types_to_keep & flag) == 0; // keep case } diff --git a/cpp/src/strings/convert/convert_datetime.cu b/cpp/src/strings/convert/convert_datetime.cu index cca06ca0739..8a953d778ed 100644 --- a/cpp/src/strings/convert/convert_datetime.cu +++ b/cpp/src/strings/convert/convert_datetime.cu @@ -317,8 +317,8 @@ struct parse_datetime { bytes_read -= left; break; } - case 'u': [[fallthrough]]; // day of week: Mon(1)-Sat(6),Sun(7) - case 'w': { // day of week; Sun(0),Mon(1)-Sat(6) + case 'u': [[fallthrough]]; // day of week: Mon(1)-Sat(6),Sun(7) + case 'w': { // day of week; Sun(0),Mon(1)-Sat(6) auto const [weekday, left] = parse_int(ptr, item.length); timeparts.weekday = // 0 is mapped to 7 for chrono library static_cast((item.value == 'w' && weekday == 0) ? 7 : weekday); @@ -1000,7 +1000,7 @@ struct datetime_formatter_fn { case 'S': // second copy_value = timeparts.second; break; - case 'f': // sub-second + case 'f': // sub-second { char subsecond_digits[] = "000000000"; // 9 max digits int const digits = [] { diff --git a/cpp/src/strings/convert/convert_durations.cu b/cpp/src/strings/convert/convert_durations.cu index 863f76b9b98..6ab70825a6b 100644 --- a/cpp/src/strings/convert/convert_durations.cu +++ b/cpp/src/strings/convert/convert_durations.cu @@ -576,7 +576,7 @@ struct parse_duration { item_length++; // : timeparts->second = parse_second(ptr + item_length, item_length); break; - case 'r': // hh:MM:SS AM/PM + case 'r': // hh:MM:SS AM/PM timeparts->hour = parse_hour(ptr, item_length); item_length++; // : timeparts->minute = parse_minute(ptr + item_length, item_length); diff --git a/cpp/src/strings/convert/convert_floats.cu b/cpp/src/strings/convert/convert_floats.cu index ab1e6870937..32167589ab4 100644 --- a/cpp/src/strings/convert/convert_floats.cu +++ b/cpp/src/strings/convert/convert_floats.cu @@ -284,7 +284,7 @@ struct ftos_converter { while (pb != buffer) // reverses the digits *ptr++ = *--pb; // e.g. 
54321 -> 12345 } else - *ptr++ = '0'; // always include at least .0 + *ptr++ = '0'; // always include at least .0 // exponent if (exp10) { *ptr++ = 'e'; @@ -310,7 +310,7 @@ struct ftos_converter { { if (std::isnan(value)) return 3; // NaN bool bneg = false; - if (signbit(value)) { // handles -0.0 too + if (signbit(value)) { // handles -0.0 too value = -value; bneg = true; } @@ -337,7 +337,7 @@ struct ftos_converter { ++count; // always include .0 // exponent if (exp10) { - count += 2; // 'e±' + count += 2; // 'e±' if (exp10 < 0) exp10 = -exp10; count += (int)(exp10 < 10); // padding while (exp10 > 0) { diff --git a/cpp/src/strings/convert/convert_integers.cu b/cpp/src/strings/convert/convert_integers.cu index 260c3393f3c..5597d2831c0 100644 --- a/cpp/src/strings/convert/convert_integers.cu +++ b/cpp/src/strings/convert/convert_integers.cu @@ -76,7 +76,7 @@ struct string_to_integer_check_fn { auto const digit = static_cast(chr - '0'); auto const bound_check = (bound_val - sign * digit) / IntegerType{10} * sign; if (value > bound_check) return false; - value = value* IntegerType{10} + digit; + value = value * IntegerType{10} + digit; } return true; diff --git a/cpp/src/strings/convert/convert_ipv4.cu b/cpp/src/strings/convert/convert_ipv4.cu index 4606aba6d17..adb72cb0263 100644 --- a/cpp/src/strings/convert/convert_ipv4.cu +++ b/cpp/src/strings/convert/convert_ipv4.cu @@ -197,7 +197,7 @@ std::unique_ptr is_ipv4(strings_column_view const& strings, if (d_str.empty()) return false; constexpr int max_ip = 255; // values must be in [0,255] int ip_vals[4] = {-1, -1, -1, -1}; - int ipv_idx = 0; // index into ip_vals + int ipv_idx = 0; // index into ip_vals for (auto const ch : d_str) { if ((ch >= '0') && (ch <= '9')) { auto const ip_val = ip_vals[ipv_idx]; diff --git a/cpp/src/strings/convert/convert_urls.cu b/cpp/src/strings/convert/convert_urls.cu index 71b6c09310e..9efa148cfd2 100644 --- a/cpp/src/strings/convert/convert_urls.cu +++ b/cpp/src/strings/convert/convert_urls.cu @@ -107,9 +107,9 @@ struct url_encoder_fn { out_ptr = copy_and_increment(out_ptr, hex, 2); // add them to the output } } - } else // these are to be utf-8 url-encoded + } else // these are to be utf-8 url-encoded { - uint8_t char_bytes[4]; // holds utf-8 bytes for one character + uint8_t char_bytes[4]; // holds utf-8 bytes for one character size_type char_width = from_char_utf8(ch, reinterpret_cast(char_bytes)); nbytes += char_width * 3; // '%' plus 2 hex chars per byte (example: é is %C3%A9) // process each byte in this current character diff --git a/cpp/src/strings/json/json_path.cu b/cpp/src/strings/json/json_path.cu index 2d2691e0518..c56752f5429 100644 --- a/cpp/src/strings/json/json_path.cu +++ b/cpp/src/strings/json/json_path.cu @@ -984,7 +984,7 @@ std::unique_ptr get_json_object(cudf::strings_column_view const& c col.size(), rmm::device_buffer{0, stream, mr}, // no data cudf::detail::create_null_mask(col.size(), mask_state::ALL_NULL, stream, mr), - col.size()); // null count + col.size()); // null count } constexpr int block_size = 512; diff --git a/cpp/src/strings/regex/regcomp.cpp b/cpp/src/strings/regex/regcomp.cpp index 5fd098a872e..b7a7f19369d 100644 --- a/cpp/src/strings/regex/regcomp.cpp +++ b/cpp/src/strings/regex/regcomp.cpp @@ -184,9 +184,9 @@ class regex_parser { int32_t _id_cclass_d{-1}; // digits [0-9] int32_t _id_cclass_D{-1}; // not digits - char32_t _chr{}; // last lex'd char - int32_t _cclass_id{}; // last lex'd class - int16_t _min_count{}; // data for counted operators + char32_t _chr{}; // last 
lex'd char + int32_t _cclass_id{}; // last lex'd class + int16_t _min_count{}; // data for counted operators int16_t _max_count{}; std::vector _items; @@ -361,9 +361,9 @@ class regex_parser { auto [q, n_chr] = next_char(); if (n_chr == 0) { return 0; } // malformed: '[x-' - if (!q && n_chr == ']') { // handles: '[x-]' + if (!q && n_chr == ']') { // handles: '[x-]' literals.push_back(chr); - literals.push_back(chr); // add '-' as literal + literals.push_back(chr); // add '-' as literal break; } // normal case: '[a-z]' @@ -749,7 +749,7 @@ class regex_parser { // infinite repeats if (n > 0) { // append '+' after last repetition out.push_back(regex_parser::Item{item.type == COUNTED ? PLUS : PLUS_LAZY, 0}); - } else { // copy it once then append '*' + } else { // copy it once then append '*' out.insert(out.end(), begin, end); out.push_back(regex_parser::Item{item.type == COUNTED ? STAR : STAR_LAZY, 0}); } @@ -1095,7 +1095,7 @@ void reprog::build_start_ids() ids.pop(); reinst const& inst = _insts[id]; if (inst.type == OR) { - if (inst.u2.left_id != id) // prevents infinite while-loop here + if (inst.u2.left_id != id) // prevents infinite while-loop here ids.push(inst.u2.left_id); if (inst.u1.right_id != id) // prevents infinite while-loop here ids.push(inst.u1.right_id); diff --git a/cpp/src/strings/regex/regcomp.h b/cpp/src/strings/regex/regcomp.h index aa2cb363b80..ab912ace0df 100644 --- a/cpp/src/strings/regex/regcomp.h +++ b/cpp/src/strings/regex/regcomp.h @@ -77,16 +77,16 @@ constexpr int32_t NCCLASS_D{1 << 5}; // not CCLASS_D or '\n' * @brief Structure of an encoded regex instruction */ struct reinst { - int32_t type; /* operator type or instruction type */ + int32_t type; /* operator type or instruction type */ union { int32_t cls_id; /* class pointer */ char32_t c; /* character */ int32_t subid; /* sub-expression id for RBRA and LBRA */ int32_t right_id; /* right child of OR */ } u1; - union { /* regexec relies on these two being in the same union */ - int32_t left_id; /* left child of OR */ - int32_t next_id; /* next instruction for CAT & LBRA */ + union { /* regexec relies on these two being in the same union */ + int32_t left_id; /* left child of OR */ + int32_t next_id; /* next instruction for CAT & LBRA */ } u2; int32_t reserved4; }; diff --git a/cpp/src/strings/regex/regex.cuh b/cpp/src/strings/regex/regex.cuh index 19d82380350..c1abbd78b43 100644 --- a/cpp/src/strings/regex/regex.cuh +++ b/cpp/src/strings/regex/regex.cuh @@ -253,21 +253,21 @@ class reprog_device { reprog_device(reprog const&); - int32_t _startinst_id; // first instruction id - int32_t _num_capturing_groups; // instruction groups - int32_t _insts_count; // number of instructions - int32_t _starts_count; // number of start-insts ids - int32_t _classes_count; // number of classes - int32_t _max_insts; // for partitioning working memory + int32_t _startinst_id; // first instruction id + int32_t _num_capturing_groups; // instruction groups + int32_t _insts_count; // number of instructions + int32_t _starts_count; // number of start-insts ids + int32_t _classes_count; // number of classes + int32_t _max_insts; // for partitioning working memory uint8_t const* _codepoint_flags{}; // table of character types reinst const* _insts{}; // array of regex instructions int32_t const* _startinst_ids{}; // array of start instruction ids reclass_device const* _classes{}; // array of regex classes - std::size_t _prog_size{}; // total size of this instance - void* _buffer{}; // working memory buffer - int32_t _thread_count{}; // 
threads available in working memory + std::size_t _prog_size{}; // total size of this instance + void* _buffer{}; // working memory buffer + int32_t _thread_count{}; // threads available in working memory }; /** diff --git a/cpp/src/strings/regex/regex.inl b/cpp/src/strings/regex/regex.inl index c5205ae7789..ce12dc17aa4 100644 --- a/cpp/src/strings/regex/regex.inl +++ b/cpp/src/strings/regex/regex.inl @@ -146,17 +146,17 @@ __device__ __forceinline__ bool reclass_device::is_match(char32_t const ch, uint32_t codept = utf8_to_codepoint(ch); if (codept > 0x00'FFFF) return false; int8_t fl = codepoint_flags[codept]; - if ((builtins & CCLASS_W) && ((ch == '_') || IS_ALPHANUM(fl))) // \w + if ((builtins & CCLASS_W) && ((ch == '_') || IS_ALPHANUM(fl))) // \w return true; - if ((builtins & CCLASS_S) && IS_SPACE(fl)) // \s + if ((builtins & CCLASS_S) && IS_SPACE(fl)) // \s return true; - if ((builtins & CCLASS_D) && IS_DIGIT(fl)) // \d + if ((builtins & CCLASS_D) && IS_DIGIT(fl)) // \d return true; if ((builtins & NCCLASS_W) && ((ch != '\n') && (ch != '_') && !IS_ALPHANUM(fl))) // \W return true; - if ((builtins & NCCLASS_S) && !IS_SPACE(fl)) // \S + if ((builtins & NCCLASS_S) && !IS_SPACE(fl)) // \S return true; - if ((builtins & NCCLASS_D) && ((ch != '\n') && !IS_DIGIT(fl))) // \D + if ((builtins & NCCLASS_D) && ((ch != '\n') && !IS_DIGIT(fl))) // \D return true; // return false; diff --git a/cpp/src/strings/replace/replace_re.cu b/cpp/src/strings/replace/replace_re.cu index 460074a5296..81ddb937be5 100644 --- a/cpp/src/strings/replace/replace_re.cu +++ b/cpp/src/strings/replace/replace_re.cu @@ -68,7 +68,7 @@ struct replace_regex_fn { if (!match) { break; } // no more matches auto const [start_pos, end_pos] = match_positions_to_bytes(*match, d_str, last_pos); - nbytes += d_repl.size_bytes() - (end_pos - start_pos); // add new size + nbytes += d_repl.size_bytes() - (end_pos - start_pos); // add new size if (out_ptr) { // replace: // i:bbbbsssseeee diff --git a/cpp/src/strings/split/partition.cu b/cpp/src/strings/split/partition.cu index 099f5978992..0c7d119ea38 100644 --- a/cpp/src/strings/split/partition.cu +++ b/cpp/src/strings/split/partition.cu @@ -170,7 +170,7 @@ struct rpartition_fn : public partition_fn { --itr; pos = check_delimiter(idx, d_str, itr); } - if (pos < 0) // delimiter not found + if (pos < 0) // delimiter not found { d_indices_left[idx] = string_index_pair{"", 0}; // two empty d_indices_delim[idx] = string_index_pair{"", 0}; // strings diff --git a/cpp/src/strings/split/split.cuh b/cpp/src/strings/split/split.cuh index e76d8ac1c60..dc0b04af388 100644 --- a/cpp/src/strings/split/split.cuh +++ b/cpp/src/strings/split/split.cuh @@ -190,7 +190,7 @@ struct split_tokenizer_fn : base_split_tokenizer { device_span d_delimiters, device_span d_tokens) const { - auto const base_ptr = get_base_ptr(); // d_positions values based on this + auto const base_ptr = get_base_ptr(); // d_positions values based on this auto str_ptr = d_str.data(); auto const str_end = str_ptr + d_str.size_bytes(); // end of the string auto const token_count = static_cast(d_tokens.size()); diff --git a/cpp/src/strings/split/split_re.cu b/cpp/src/strings/split/split_re.cu index 9aeb6b69bdc..3be5937297f 100644 --- a/cpp/src/strings/split/split_re.cu +++ b/cpp/src/strings/split/split_re.cu @@ -91,7 +91,7 @@ struct token_reader_fn { } else { if (direction == split_direction::FORWARD) { break; } // we are done for (auto l = 0; l < token_idx - 1; ++l) { - d_result[l] = d_result[l + 1]; // shift left + d_result[l] = 
d_result[l + 1]; // shift left } d_result[token_idx - 1] = token; } diff --git a/cpp/src/strings/utilities.cu b/cpp/src/strings/utilities.cu index 57a868485df..c8c68d19ce6 100644 --- a/cpp/src/strings/utilities.cu +++ b/cpp/src/strings/utilities.cu @@ -86,9 +86,9 @@ thread_safe_per_context_cache d_special_case_mappings; } // namespace - /** - * @copydoc cudf::strings::detail::get_character_flags_table - */ +/** + * @copydoc cudf::strings::detail::get_character_flags_table + */ character_flags_table_type const* get_character_flags_table() { return d_character_codepoint_flags.find_or_initialize([&](void) { diff --git a/cpp/src/text/normalize.cu b/cpp/src/text/normalize.cu index 78dfb6bf1a6..1b07b0785f5 100644 --- a/cpp/src/text/normalize.cu +++ b/cpp/src/text/normalize.cu @@ -70,7 +70,7 @@ struct normalize_spaces_fn { cudf::string_view const single_space(" ", 1); auto const d_str = d_strings.element(idx); char* buffer = d_chars ? d_chars + d_offsets[idx] : nullptr; - char* optr = buffer; // running output pointer + char* optr = buffer; // running output pointer cudf::size_type nbytes = 0; // holds the number of bytes per output string @@ -146,7 +146,7 @@ struct codepoint_to_utf8_fn { char* out_ptr = d_chars + d_offsets[idx]; for (uint32_t jdx = 0; jdx < count; ++jdx) { uint32_t code_point = *str_cps++; - if (code_point < UTF8_1BYTE) // ASCII range + if (code_point < UTF8_1BYTE) // ASCII range *out_ptr++ = static_cast(code_point); else if (code_point < UTF8_2BYTE) { // create two-byte UTF-8 // b00001xxx:byyyyyyyy => b110xxxyy:b10yyyyyy diff --git a/cpp/src/text/replace.cu b/cpp/src/text/replace.cu index d122f048a4e..34916e121dc 100644 --- a/cpp/src/text/replace.cu +++ b/cpp/src/text/replace.cu @@ -114,7 +114,7 @@ using strings_iterator = cudf::column_device_view::const_iterator= end) { break; } // done checking for pairs // skip to the next adjacent pair diff --git a/cpp/src/text/subword/load_merges_file.cu b/cpp/src/text/subword/load_merges_file.cu index 1f1b90b3f49..db6ad2e2dd2 100644 --- a/cpp/src/text/subword/load_merges_file.cu +++ b/cpp/src/text/subword/load_merges_file.cu @@ -93,7 +93,7 @@ std::unique_ptr initialize_merge_pairs_map( auto merge_pairs_map = std::make_unique( static_cast(input.size() * 2), // capacity is 2x; cuco::empty_key{-1}, - cuco::empty_value{-1}, // empty value is not used + cuco::empty_value{-1}, // empty value is not used bpe_equal{input}, probe_scheme{bpe_hasher{input}}, hash_table_allocator_type{default_allocator{}, stream}, diff --git a/cpp/src/text/utilities/tokenize_ops.cuh b/cpp/src/text/utilities/tokenize_ops.cuh index fbd2d1efcff..a84e94a6924 100644 --- a/cpp/src/text/utilities/tokenize_ops.cuh +++ b/cpp/src/text/utilities/tokenize_ops.cuh @@ -230,7 +230,7 @@ struct multi_delimiter_strings_tokenizer { }); if (itr_find != delimiters_end) { // found delimiter auto token_size = static_cast((curr_ptr - data_ptr) - last_pos); - if (token_size > 0) // we only care about non-zero sized tokens + if (token_size > 0) // we only care about non-zero sized tokens { if (d_str_tokens) d_str_tokens[token_idx] = string_index_pair{data_ptr + last_pos, token_size}; diff --git a/cpp/tests/groupby/merge_lists_tests.cpp b/cpp/tests/groupby/merge_lists_tests.cpp index 991473c5023..f2909f870aa 100644 --- a/cpp/tests/groupby/merge_lists_tests.cpp +++ b/cpp/tests/groupby/merge_lists_tests.cpp @@ -374,7 +374,7 @@ TEST_F(GroupbyMergeListsTest, StringsColumnInput) "" /*NULL*/, "" /*NULL*/, "German Shepherd", - "" /*NULL*/ + "" /*NULL*/ }, nulls_at({3, 4, 5, 7})}, // key = "dog" 
lists_col{{"Whale", "" /*NULL*/, "Polar Bear"}, null_at(1)}, // key = "unknown" diff --git a/cpp/tests/groupby/merge_sets_tests.cpp b/cpp/tests/groupby/merge_sets_tests.cpp index 67ff61563bb..5fc7e68b524 100644 --- a/cpp/tests/groupby/merge_sets_tests.cpp +++ b/cpp/tests/groupby/merge_sets_tests.cpp @@ -333,7 +333,7 @@ TEST_F(GroupbyMergeSetsTest, StringsColumnInput) lists_col{{"" /*NULL*/, "" /*NULL*/, "" /*NULL*/}, all_nulls()} // key = "dog" }; auto const lists3 = lists_col{ - lists_col{"Fuji", "Red Delicious"}, // key = "apple" + lists_col{"Fuji", "Red Delicious"}, // key = "apple" lists_col{{"" /*NULL*/, "Corgi", "German Shepherd", "" /*NULL*/, "Golden Retriever"}, nulls_at({0, 3})}, // key = "dog" lists_col{{"Seeedless", "Mini"}, no_nulls()} // key = "water melon" @@ -343,14 +343,14 @@ TEST_F(GroupbyMergeSetsTest, StringsColumnInput) merge_sets(vcol_views{keys1, keys2, keys3}, vcol_views{lists1, lists2, lists3}); auto const expected_keys = strings_col{"apple", "banana", "dog", "unknown", "water melon"}; auto const expected_lists = lists_col{ - lists_col{"Fuji", "Honey Bee", "Red Delicious"}, // key = "apple" - lists_col{"Green", "Yellow"}, // key = "banana" + lists_col{"Fuji", "Honey Bee", "Red Delicious"}, // key = "apple" + lists_col{"Green", "Yellow"}, // key = "banana" lists_col{{ "Corgi", "German Shepherd", "Golden Retriever", "Poodle", "" /*NULL*/ }, - null_at(4)}, // key = "dog" - lists_col{{"Polar Bear", "Whale", "" /*NULL*/}, null_at(2)}, // key = "unknown" - lists_col{{"Mini", "Seeedless"}, no_nulls()} // key = "water melon" + null_at(4)}, // key = "dog" + lists_col{{"Polar Bear", "Whale", "" /*NULL*/}, null_at(2)}, // key = "unknown" + lists_col{{"Mini", "Seeedless"}, no_nulls()} // key = "water melon" }; CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected_keys, *out_keys, verbosity); diff --git a/cpp/tests/io/parquet_test.cpp b/cpp/tests/io/parquet_test.cpp index 64aca091686..81e0e12eeb9 100644 --- a/cpp/tests/io/parquet_test.cpp +++ b/cpp/tests/io/parquet_test.cpp @@ -2166,7 +2166,7 @@ TEST_F(ParquetChunkedWriterTest, ForcedNullabilityList) cudf::io::table_input_metadata metadata(table1); metadata.column_metadata[0].set_nullability(true); // List is nullable at first (root) level metadata.column_metadata[0].child(1).set_nullability( - false); // non-nullable at second (leaf) level + false); // non-nullable at second (leaf) level metadata.column_metadata[1].set_nullability(true); auto filepath = temp_env->get_temp_filepath("ChunkedListNullable.parquet"); @@ -5880,7 +5880,7 @@ TEST_F(ParquetMetadataReaderTest, TestNested) EXPECT_EQ(out_map_col.type_kind(), cudf::io::parquet::TypeKind::UNDEFINED_TYPE); // map ASSERT_EQ(out_map_col.num_children(), 1); - EXPECT_EQ(out_map_col.child(0).name(), "key_value"); // key_value (named in parquet writer) + EXPECT_EQ(out_map_col.child(0).name(), "key_value"); // key_value (named in parquet writer) ASSERT_EQ(out_map_col.child(0).num_children(), 2); EXPECT_EQ(out_map_col.child(0).child(0).name(), "key"); // key (named in parquet writer) EXPECT_EQ(out_map_col.child(0).child(1).name(), "value"); // value (named in parquet writer) @@ -5897,7 +5897,7 @@ TEST_F(ParquetMetadataReaderTest, TestNested) ASSERT_EQ(out_list_col.child(0).num_children(), 1); auto const& out_list_struct_col = out_list_col.child(0).child(0); - EXPECT_EQ(out_list_struct_col.name(), "element"); // elements (named in parquet writer) + EXPECT_EQ(out_list_struct_col.name(), "element"); // elements (named in parquet writer) EXPECT_EQ(out_list_struct_col.type_kind(), 
cudf::io::parquet::TypeKind::UNDEFINED_TYPE); // struct ASSERT_EQ(out_list_struct_col.num_children(), 2); diff --git a/cpp/tests/lists/reverse_tests.cpp b/cpp/tests/lists/reverse_tests.cpp index a899d387c3e..00dc13c5812 100644 --- a/cpp/tests/lists/reverse_tests.cpp +++ b/cpp/tests/lists/reverse_tests.cpp @@ -370,8 +370,8 @@ TYPED_TEST(ListsReverseTypedTest, InputListsOfStructsWithNulls) "Kiwi", "Cherry", "Banana", - "", /*NULL*/ - "", /*NULL*/ + "", /*NULL*/ + "", /*NULL*/ "Apple", "", /*NULL*/ "Banana", // end list1 @@ -436,8 +436,8 @@ TYPED_TEST(ListsReverseTypedTest, InputListsOfStructsWithNulls) "Kiwi", "Cherry", "Banana", - "", /*NULL*/ - "", /*NULL*/ + "", /*NULL*/ + "", /*NULL*/ "Apple", "", /*NULL*/ "Banana", // end list1 diff --git a/cpp/tests/lists/set_operations/difference_distinct_tests.cpp b/cpp/tests/lists/set_operations/difference_distinct_tests.cpp index bf7ebc902ba..84c51f256b7 100644 --- a/cpp/tests/lists/set_operations/difference_distinct_tests.cpp +++ b/cpp/tests/lists/set_operations/difference_distinct_tests.cpp @@ -571,7 +571,7 @@ TEST_F(SetDifferenceTest, InputListsOfNestedStructsHaveNull) "" /*NULL*/, "" /*NULL*/, "" /*NULL*/, "Apple", "Banana", "Cherry", "Kiwi", // end list1 "" /*NULL*/, "Bear", "Cat", "Dog", "Duck", - "Panda", // end list2 + "Panda", // end list2 "ÁÁÁ", "ÉÉÉÉÉ", "ÁBC", "ÁÁÁ", "ÍÍÍÍÍ", "" /*NULL*/, "XYZ", "ÁBC" // end list3 diff --git a/cpp/tests/lists/set_operations/intersect_distinct_tests.cpp b/cpp/tests/lists/set_operations/intersect_distinct_tests.cpp index dbccf06036b..11f98af3520 100644 --- a/cpp/tests/lists/set_operations/intersect_distinct_tests.cpp +++ b/cpp/tests/lists/set_operations/intersect_distinct_tests.cpp @@ -514,7 +514,7 @@ TEST_F(SetIntersectTest, InputListsOfNestedStructsHaveNull) null, // end list1 null, // end list2 null, - null // end list3 + null // end list3 }, all_nulls()}; auto grandchild2 = strings_col{{ @@ -522,7 +522,7 @@ TEST_F(SetIntersectTest, InputListsOfNestedStructsHaveNull) "Apple", // end list1 "" /*NULL*/, // end list2 "ÁÁÁ", - "ÉÉÉÉÉ" // end list3 + "ÉÉÉÉÉ" // end list3 }, nulls_at({0, 2})}; auto child1 = structs_col{{grandchild1, grandchild2}, null_at(0)}; diff --git a/cpp/tests/lists/set_operations/union_distinct_tests.cpp b/cpp/tests/lists/set_operations/union_distinct_tests.cpp index 5cc0897351d..e33ea31541b 100644 --- a/cpp/tests/lists/set_operations/union_distinct_tests.cpp +++ b/cpp/tests/lists/set_operations/union_distinct_tests.cpp @@ -560,7 +560,7 @@ TEST_F(SetUnionTest, InputListsOfNestedStructsHaveNull) auto grandchild2 = strings_col{{ "" /*NULL*/, "Apple", "Banana", "Cherry", "Kiwi", "Banana", "Cherry", - "Kiwi", // end list1 + "Kiwi", // end list1 "" /*NULL*/, "Bear", "Cat", "Dog", "Duck", "Panda", "Bear", "Cat", "Dog", "Duck", "Panda", // end list2 @@ -597,7 +597,7 @@ TEST_F(SetUnionTest, InputListsOfNestedStructsHaveNull) { "" /*NULL*/, "" /*NULL*/, "" /*NULL*/, "" /*NULL*/, "" /*NULL*/, "" /*NULL*/, "Apple", "Apple", "Banana", "Cherry", "Kiwi", "Banana", "Cherry", - "Kiwi", // end list1 + "Kiwi", // end list1 "" /*NULL*/, "" /*NULL*/, "Bear", "Cat", "Dog", "Duck", "Panda", "Bear", "Cat", "Dog", "Duck", "Panda", // end list2 "ÁÁÁ", "ÁÁÁ", "ÉÉÉÉÉ", "ÉÉÉÉÉ", "ÁBC", "ÁÁÁ", "ÍÍÍÍÍ", diff --git a/cpp/tests/lists/stream_compaction/distinct_tests.cpp b/cpp/tests/lists/stream_compaction/distinct_tests.cpp index 57d1714c255..fbc637f9315 100644 --- a/cpp/tests/lists/stream_compaction/distinct_tests.cpp +++ b/cpp/tests/lists/stream_compaction/distinct_tests.cpp @@ -529,7 +529,7 @@ 
TEST_F(ListDistinctTest, InputListsOfStructsHaveNull) 2, 3, 3, - 3}, // end list3 + 3}, // end list3 nulls_at({1, 6, 12, 13})}; auto child2 = strings_col{{ // begin list1 "XXX", /*NULL*/ @@ -551,7 +551,7 @@ TEST_F(ListDistinctTest, InputListsOfStructsHaveNull) "ÁBC", "ÁÁÁ", "ÍÍÍÍÍ", - "", /*NULL*/ + "", /*NULL*/ "XYZ", "ÁBC"}, // end list3 nulls_at({6, 17})}; @@ -670,7 +670,7 @@ TEST_F(ListDistinctTest, InputListsOfNestedStructsHaveNull) "ÁBC", "ÁÁÁ", "ÍÍÍÍÍ", - "", /*NULL*/ + "", /*NULL*/ "XYZ", "ÁBC" // end list3 }, @@ -729,8 +729,8 @@ TEST_F(ListDistinctTest, InputListsOfStructsOfLists) floats_lists{3, 4, 5}, // end list2 // begin list3 floats_lists{}, - floats_lists{}, // end list3 - // begin list4 + floats_lists{}, // end list3 + // begin list4 floats_lists{6, 7}, floats_lists{6, 7}, floats_lists{6, 7}}; diff --git a/cpp/tests/reshape/interleave_columns_tests.cpp b/cpp/tests/reshape/interleave_columns_tests.cpp index eba6c961bbb..e8ea9d619c5 100644 --- a/cpp/tests/reshape/interleave_columns_tests.cpp +++ b/cpp/tests/reshape/interleave_columns_tests.cpp @@ -806,7 +806,7 @@ TYPED_TEST(ListsColumnsInterleaveTypedTest, SlicedInputListsOfListsWithNulls) ListsCol{ListsCol{{null, 11}, null_at(0)}, ListsCol{{22, null, null}, nulls_at({1, 2})}}, // don't care ListsCol{ListsCol{{null, 11}, null_at(0)}, - ListsCol{{22, null, null}, nulls_at({1, 2})}} // don't care + ListsCol{{22, null, null}, nulls_at({1, 2})}} // don't care }; auto const col1 = cudf::slice(col1_original, {3, 6})[0]; diff --git a/cpp/tests/rolling/range_rolling_window_test.cpp b/cpp/tests/rolling/range_rolling_window_test.cpp index 585383f28f8..eed9db1fe04 100644 --- a/cpp/tests/rolling/range_rolling_window_test.cpp +++ b/cpp/tests/rolling/range_rolling_window_test.cpp @@ -91,7 +91,7 @@ struct window_exec { ScalarT preceding; // Preceding window scalar. ScalarT following; // Following window scalar. 
cudf::size_type min_periods = 1; -}; // struct window_exec; +}; // struct window_exec; struct RangeRollingTest : public cudf::test::BaseFixture {}; diff --git a/cpp/tests/sort/segmented_sort_tests.cpp b/cpp/tests/sort/segmented_sort_tests.cpp index b3f98eb54b9..da9666cbc74 100644 --- a/cpp/tests/sort/segmented_sort_tests.cpp +++ b/cpp/tests/sort/segmented_sort_tests.cpp @@ -270,7 +270,7 @@ TEST_F(SegmentedSortInt, Sliced) column_wrapper expected2{{0, 1, 3, 2, 4, 5, 6}}; column_wrapper expected3{{0, 1, 2, 3, 4, 5, 6}}; // clang-format on - auto slice = cudf::slice(col1, {4, 11})[0]; // 7 elements + auto slice = cudf::slice(col1, {4, 11})[0]; // 7 elements cudf::table_view input{{slice}}; auto seg_slice = cudf::slice(segments2, {2, 4})[0]; // 2 elements diff --git a/cpp/tests/strings/chars_types_tests.cpp b/cpp/tests/strings/chars_types_tests.cpp index a16da41af7a..c595977c269 100644 --- a/cpp/tests/strings/chars_types_tests.cpp +++ b/cpp/tests/strings/chars_types_tests.cpp @@ -50,17 +50,17 @@ TEST_P(CharsTypes, AllTypes) "\t\r\n\f "}; bool expecteds[] = {false, false, false, false, false, false, false, false, - false, false, false, false, false, true, false, false, // decimal + false, false, false, false, false, true, false, false, // decimal false, false, false, false, false, false, false, false, - false, true, false, true, false, true, false, false, // numeric + false, true, false, true, false, true, false, false, // numeric false, false, false, false, false, false, false, false, - false, false, false, true, false, true, false, false, // digit + false, false, false, true, false, true, false, false, // digit true, true, false, true, false, false, false, false, - false, false, false, false, false, false, true, false, // alpha + false, false, false, false, false, false, true, false, // alpha false, false, false, false, false, false, false, false, - false, false, false, false, false, false, false, true, // space + false, false, false, false, false, false, false, true, // space false, false, false, true, false, false, false, false, - false, false, false, false, false, false, false, false, // upper + false, false, false, false, false, false, false, false, // upper false, true, false, false, false, false, false, false, false, false, false, false, false, false, true, false}; // lower diff --git a/cpp/tests/strings/durations_tests.cpp b/cpp/tests/strings/durations_tests.cpp index 0c7a1ad8042..1902f907f43 100644 --- a/cpp/tests/strings/durations_tests.cpp +++ b/cpp/tests/strings/durations_tests.cpp @@ -398,7 +398,7 @@ TEST_F(StringsDurationsTest, ParseSingle) "-59", "999", "-999", - "", // error + "", // error "01", ""}; // error auto size = cudf::column_view(string_src).size(); @@ -449,7 +449,7 @@ TEST_F(StringsDurationsTest, ParseMultiple) "-59:00:00", "999:00:00", "-999:00:00", - "", // error + "", // error "01:01:01", ""}; // error auto size = cudf::column_view(string_src).size(); @@ -503,7 +503,7 @@ TEST_F(StringsDurationsTest, ParseSubsecond) "-59:00:00", "999:00:00", "-999:00:00", - "", // error + "", // error "01:01:01", ""}; // error auto size = cudf::column_view(string_src).size(); @@ -660,7 +660,7 @@ TEST_F(StringsDurationsTest, ParseCompoundSpecifier) "09:00 AM", // error "", // error "01:01:01", - ""}; // error + ""}; // error cudf::test::fixed_width_column_wrapper expected_s3( {0, diff --git a/cpp/tests/utilities/column_utilities.cu b/cpp/tests/utilities/column_utilities.cu index bae402155e9..620e0bfe8de 100644 --- a/cpp/tests/utilities/column_utilities.cu +++ 
b/cpp/tests/utilities/column_utilities.cu @@ -440,7 +440,7 @@ class corresponding_rows_not_equivalent { // Must handle inf and nan separately if (std::isinf(x) || std::isinf(y)) { - return x != y; // comparison of (inf==inf) returns true + return x != y; // comparison of (inf==inf) returns true } else if (std::isnan(x) || std::isnan(y)) { return std::isnan(x) != std::isnan(y); // comparison of (nan==nan) returns false } else { From 97501d87e2070e8f07eb17b2c5e59742c490c6b1 Mon Sep 17 00:00:00 2001 From: Karthikeyan <6488848+karthikeyann@users.noreply.github.com> Date: Wed, 20 Sep 2023 07:42:20 +0530 Subject: [PATCH 185/230] Long string optimization for string column parsing in JSON reader (#13803) closes #13724 Previously, 1 thread per string was allocated to parse a string column. For longer strings (>1024), decoding with 1 thread per string takes too long even when only a few such strings are present. With this change, 1 warp per string is used for strings of length <=1024 and 1 block per string for strings longer than 1024; if the maximum string length is < 128, 1 thread per string is used as before. Both kernels use 256 threads per block. The 1-warp-per-string and 1-block-per-string code paths are similar, differing only in the warp-wide versus block-wide primitives used for reduction and scan operations; shared memory usage also differs slightly. Authors: - Karthikeyan (https://github.com/karthikeyann) - Vukasin Milovanovic (https://github.com/vuule) Approvers: - Robert Maynard (https://github.com/robertmaynard) - Vukasin Milovanovic (https://github.com/vuule) - Elias Stehle (https://github.com/elstehle) - Lawrence Mitchell (https://github.com/wence-) URL: https://github.com/rapidsai/cudf/pull/13803 --- cpp/CMakeLists.txt | 2 + cpp/include/cudf/io/detail/data_casting.cuh | 431 -------- cpp/src/io/json/json_column.cu | 39 +- cpp/src/io/json/nested_json_gpu.cu | 22 +- cpp/src/io/json/write_json.cu | 3 +- cpp/src/io/utilities/data_casting.cu | 987 ++++++++++++++++++ cpp/src/io/utilities/parsing_utils.cuh | 24 +- cpp/src/io/utilities/string_parsing.hpp | 79 ++ .../{type_inference.cuh => type_inference.cu} | 57 +- cpp/tests/io/json_test.cpp | 119 +++ cpp/tests/io/json_type_cast_test.cu | 189 +++- cpp/tests/io/type_inference_test.cu | 30 +- 12 files changed, 1395 insertions(+), 587 deletions(-) delete mode 100644 cpp/include/cudf/io/detail/data_casting.cuh create mode 100644 cpp/src/io/utilities/data_casting.cu create mode 100644 cpp/src/io/utilities/string_parsing.hpp rename cpp/src/io/utilities/{type_inference.cuh => type_inference.cu} (84%) diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 900e9eed98e..a84f7bd5224 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -413,11 +413,13 @@ add_library( src/io/utilities/arrow_io_source.cpp src/io/utilities/column_buffer.cpp src/io/utilities/config_utils.cpp + src/io/utilities/data_casting.cu src/io/utilities/data_sink.cpp src/io/utilities/datasource.cpp src/io/utilities/file_io_utilities.cpp src/io/utilities/parsing_utils.cu src/io/utilities/row_selection.cpp + src/io/utilities/type_inference.cu src/io/utilities/trie.cu src/jit/cache.cpp src/jit/parser.cpp diff --git a/cpp/include/cudf/io/detail/data_casting.cuh b/cpp/include/cudf/io/detail/data_casting.cuh deleted file mode 100644 index b7ee5e05e96..00000000000 --- a/cpp/include/cudf/io/detail/data_casting.cuh +++ /dev/null @@ -1,431 +0,0 @@ -/* - * Copyright (c) 2022-2023, NVIDIA CORPORATION.
- * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include - -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include - -#include - -#include - -namespace cudf::io::json::detail { - -// Unicode code point escape sequence -static constexpr char UNICODE_SEQ = 0x7F; - -// Invalid escape sequence -static constexpr char NON_ESCAPE_CHAR = 0x7E; - -// Unicode code point escape sequence prefix comprises '\' and 'u' characters -static constexpr size_type UNICODE_ESC_PREFIX = 2; - -// Unicode code point escape sequence comprises four hex characters -static constexpr size_type UNICODE_HEX_DIGIT_COUNT = 4; - -// A unicode code point escape sequence is \uXXXX -static auto constexpr NUM_UNICODE_ESC_SEQ_CHARS = UNICODE_ESC_PREFIX + UNICODE_HEX_DIGIT_COUNT; - -static constexpr auto UTF16_HIGH_SURROGATE_BEGIN = 0xD800; -static constexpr auto UTF16_HIGH_SURROGATE_END = 0xDC00; -static constexpr auto UTF16_LOW_SURROGATE_BEGIN = 0xDC00; -static constexpr auto UTF16_LOW_SURROGATE_END = 0xE000; - -/** - * @brief Describing whether data casting of a certain item succeed, the item was parsed to null, or - * whether type casting failed. - */ -enum class data_casting_result { PARSING_SUCCESS, PARSED_TO_NULL, PARSING_FAILURE }; - -/** - * @brief Providing additional information about the type casting result. - */ -struct data_casting_result_info { - // Number of bytes written to output - size_type bytes; - // Whether parsing succeeded, item was parsed to null, or failed - data_casting_result result; -}; - -/** - * @brief Returns the character to output for a given escaped character that's following a - * backslash. - * - * @param escaped_char The character following the backslash. - * @return The character to output for a given character that's following a backslash - */ -__device__ __forceinline__ char get_escape_char(char escaped_char) -{ - switch (escaped_char) { - case '"': return '"'; - case '\\': return '\\'; - case '/': return '/'; - case 'b': return '\b'; - case 'f': return '\f'; - case 'n': return '\n'; - case 'r': return '\r'; - case 't': return '\t'; - case 'u': return UNICODE_SEQ; - default: return NON_ESCAPE_CHAR; - } -} - -/** - * @brief Returns the escaped characters for a given character. - * - * @param escaped_char The character to escape. - * @return The escaped characters for a given character. - */ -__device__ __forceinline__ thrust::pair get_escaped_char(char escaped_char) -{ - switch (escaped_char) { - case '"': return {'\\', '"'}; - case '\\': return {'\\', '\\'}; - case '/': return {'\\', '/'}; - case '\b': return {'\\', 'b'}; - case '\f': return {'\\', 'f'}; - case '\n': return {'\\', 'n'}; - case '\r': return {'\\', 'r'}; - case '\t': return {'\\', 't'}; - // case 'u': return UNICODE_SEQ; - default: return {'\0', escaped_char}; - } -} -/** - * @brief Parses the hex value from the four hex digits of a unicode code point escape sequence - * \uXXXX. 
- * - * @param str Pointer to the first (most-significant) hex digit - * @return The parsed hex value if successful, -1 otherwise. - */ -__device__ __forceinline__ int32_t parse_unicode_hex(char const* str) -{ - // Prepare result - int32_t result = 0, base = 1; - constexpr int32_t hex_radix = 16; - - // Iterate over hex digits right-to-left - size_type index = UNICODE_HEX_DIGIT_COUNT; - while (index-- > 0) { - char const ch = str[index]; - if (ch >= '0' && ch <= '9') { - result += static_cast((ch - '0') + 0) * base; - base *= hex_radix; - } else if (ch >= 'A' && ch <= 'F') { - result += static_cast((ch - 'A') + 10) * base; - base *= hex_radix; - } else if (ch >= 'a' && ch <= 'f') { - result += static_cast((ch - 'a') + 10) * base; - base *= hex_radix; - } else { - return -1; - } - } - return result; -} - -/** - * @brief Writes the UTF-8 byte sequence to \p out_it and returns the number of bytes written to - * \p out_it - */ -constexpr size_type write_utf8_char(char_utf8 character, char*& out_it) -{ - auto const bytes = (out_it == nullptr) ? strings::detail::bytes_in_char_utf8(character) - : strings::detail::from_char_utf8(character, out_it); - if (out_it) out_it += bytes; - return bytes; -} - -/** - * @brief Processes a string, replaces escape sequences and optionally strips off the quote - * characters. - * - * @tparam in_iterator_t A bidirectional input iterator type whose value_type is convertible to - * char - * @param in_begin Iterator to the first item to process - * @param in_end Iterator to one past the last item to process - * @param d_buffer Output character buffer to the first item to write - * @param options Settings for controlling string processing behavior - * @return A struct of (num_bytes_written, parsing_success_result), where num_bytes_written is - * the number of bytes written to d_buffer, parsing_success_result is enum value indicating whether - * parsing succeeded, item was parsed to null, or failed. - */ -template -__device__ __forceinline__ data_casting_result_info -process_string(in_iterator_t in_begin, - in_iterator_t in_end, - char* d_buffer, - cudf::io::parse_options_view const& options) -{ - int32_t bytes = 0; - auto const num_in_chars = thrust::distance(in_begin, in_end); - // String values are indicated by keeping the quote character - bool const is_string_value = - num_in_chars >= 2LL && - (options.quotechar == '\0' || - (*in_begin == options.quotechar) && (*thrust::prev(in_end) == options.quotechar)); - - // Copy literal/numeric value - if (not is_string_value) { - while (in_begin != in_end) { - if (d_buffer) *d_buffer++ = *in_begin; - ++in_begin; - ++bytes; - } - return {bytes, data_casting_result::PARSING_SUCCESS}; - } - // Whether in the original JSON this was a string value enclosed in quotes - // ({"a":"foo"} vs. 
{"a":1.23}) - char const backslash_char = '\\'; - - // Escape-flag, set after encountering a backslash character - bool escape = false; - - // Exclude beginning and ending quote chars from string range - if (!options.keepquotes) { - ++in_begin; - --in_end; - } - - // Iterate over the input - while (in_begin != in_end) { - // Copy single character to output - if (!escape) { - escape = (*in_begin == backslash_char); - if (!escape) { - if (d_buffer) *d_buffer++ = *in_begin; - ++bytes; - } - ++in_begin; - continue; - } - - // Previous char indicated beginning of escape sequence - // Reset escape flag for next loop iteration - escape = false; - - // Check the character that is supposed to be escaped - auto escaped_char = get_escape_char(*in_begin); - - // We escaped an invalid escape character -> "fail"/null for this item - if (escaped_char == NON_ESCAPE_CHAR) { return {bytes, data_casting_result::PARSING_FAILURE}; } - - // Regular, single-character escape - if (escaped_char != UNICODE_SEQ) { - if (d_buffer) *d_buffer++ = escaped_char; - ++bytes; - ++in_begin; - continue; - } - - // This is an escape sequence of a unicode code point: \uXXXX, - // where each X in XXXX represents a hex digit - // Skip over the 'u' char from \uXXXX to the first hex digit - ++in_begin; - - // Make sure that there's at least 4 characters left from the - // input, which are expected to be hex digits - if (thrust::distance(in_begin, in_end) < UNICODE_HEX_DIGIT_COUNT) { - return {bytes, data_casting_result::PARSING_FAILURE}; - } - - auto hex_val = parse_unicode_hex(in_begin); - - // Couldn't parse hex values from the four-character sequence -> "fail"/null for this item - if (hex_val < 0) { return {bytes, data_casting_result::PARSING_FAILURE}; } - - // Skip over the four hex digits - thrust::advance(in_begin, UNICODE_HEX_DIGIT_COUNT); - - // If this may be a UTF-16 encoded surrogate pair: - // we expect another \uXXXX sequence - int32_t hex_low_val = 0; - if (thrust::distance(in_begin, in_end) >= NUM_UNICODE_ESC_SEQ_CHARS && - *in_begin == backslash_char && *thrust::next(in_begin) == 'u') { - // Try to parse hex value following the '\' and 'u' characters from what may be a UTF16 low - // surrogate - hex_low_val = parse_unicode_hex(thrust::next(in_begin, 2)); - } - - // This is indeed a UTF16 surrogate pair - if (hex_val >= UTF16_HIGH_SURROGATE_BEGIN && hex_val < UTF16_HIGH_SURROGATE_END && - hex_low_val >= UTF16_LOW_SURROGATE_BEGIN && hex_low_val < UTF16_LOW_SURROGATE_END) { - // Skip over the second \uXXXX sequence - thrust::advance(in_begin, NUM_UNICODE_ESC_SEQ_CHARS); - - // Compute UTF16-encoded code point - uint32_t unicode_code_point = 0x10000 + ((hex_val - UTF16_HIGH_SURROGATE_BEGIN) << 10) + - (hex_low_val - UTF16_LOW_SURROGATE_BEGIN); - auto utf8_chars = strings::detail::codepoint_to_utf8(unicode_code_point); - bytes += write_utf8_char(utf8_chars, d_buffer); - } - - // Just a single \uXXXX sequence - else { - auto utf8_chars = strings::detail::codepoint_to_utf8(hex_val); - bytes += write_utf8_char(utf8_chars, d_buffer); - } - } - - // The last character of the input is a backslash -> "fail"/null for this item - if (escape) { return {bytes, data_casting_result::PARSING_FAILURE}; } - return {bytes, data_casting_result::PARSING_SUCCESS}; -} - -template -struct string_parse { - str_tuple_it str_tuples; - bitmask_type* null_mask; - size_type* null_count_data; - cudf::io::parse_options_view const options; - size_type* d_offsets{}; - char* d_chars{}; - - __device__ void operator()(size_type idx) - { - if (null_mask != 
nullptr && not bit_is_set(null_mask, idx)) { - if (!d_chars) d_offsets[idx] = 0; - return; - } - auto const in_begin = str_tuples[idx].first; - auto const in_end = in_begin + str_tuples[idx].second; - auto const num_in_chars = str_tuples[idx].second; - - // Check if the value corresponds to the null literal - auto const is_null_literal = - (!d_chars) && - serialized_trie_contains(options.trie_na, {in_begin, static_cast(num_in_chars)}); - if (is_null_literal && null_mask != nullptr) { - clear_bit(null_mask, idx); - atomicAdd(null_count_data, 1); - if (!d_chars) d_offsets[idx] = 0; - return; - } - - char* d_buffer = d_chars ? d_chars + d_offsets[idx] : nullptr; - auto str_process_info = process_string(in_begin, in_end, d_buffer, options); - if (str_process_info.result != data_casting_result::PARSING_SUCCESS) { - if (null_mask != nullptr) { - clear_bit(null_mask, idx); - atomicAdd(null_count_data, 1); - } - if (!d_chars) d_offsets[idx] = 0; - } else { - if (!d_chars) d_offsets[idx] = str_process_info.bytes; - } - } -}; -/** - * @brief Parses the data from an iterator of string views, casting it to the given target data type - * - * @param str_tuples Iterator returning a string view, i.e., a (ptr, length) pair - * @param col_size The total number of items of this column - * @param col_type The column's target data type - * @param null_mask A null mask that renders certain items from the input invalid - * @param options Settings for controlling the processing behavior - * @param stream CUDA stream used for device memory operations and kernel launches - * @param mr The resource to be used for device memory allocation - * @return The column that contains the parsed data - */ -template -std::unique_ptr parse_data(str_tuple_it str_tuples, - size_type col_size, - data_type col_type, - B&& null_mask, - size_type null_count, - cudf::io::parse_options_view const& options, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) -{ - CUDF_FUNC_RANGE(); - - auto d_null_count = rmm::device_scalar(null_count, stream); - auto null_count_data = d_null_count.data(); - - if (col_type == cudf::data_type{cudf::type_id::STRING}) { - // this utility calls the functor to build the offsets and chars columns; - // the bitmask and null count may be updated by parse failures - auto [offsets, chars] = cudf::strings::detail::make_strings_children( - string_parse{ - str_tuples, static_cast(null_mask.data()), null_count_data, options}, - col_size, - stream, - mr); - - return make_strings_column(col_size, - std::move(offsets), - std::move(chars), - d_null_count.value(stream), - std::move(null_mask)); - } - - auto out_col = - make_fixed_width_column(col_type, col_size, std::move(null_mask), null_count, stream, mr); - auto output_dv_ptr = mutable_column_device_view::create(*out_col, stream); - - // use existing code (`ConvertFunctor`) to convert values - thrust::for_each_n( - rmm::exec_policy(stream), - thrust::make_counting_iterator(0), - col_size, - [str_tuples, col = *output_dv_ptr, options, col_type, null_count_data] __device__( - size_type row) { - if (col.is_null(row)) { return; } - auto const in = str_tuples[row]; - - auto const is_null_literal = - serialized_trie_contains(options.trie_na, {in.first, static_cast(in.second)}); - - if (is_null_literal) { - col.set_null(row); - atomicAdd(null_count_data, 1); - return; - } - - // If this is a string value, remove quotes - auto [in_begin, in_end] = trim_quotes(in.first, in.first + in.second, options.quotechar); - - auto const is_parsed = 
cudf::type_dispatcher(col_type, - ConvertFunctor{}, - in_begin, - in_end, - col.data(), - row, - col_type, - options, - false); - if (not is_parsed) { - col.set_null(row); - atomicAdd(null_count_data, 1); - } - }); - - out_col->set_null_count(d_null_count.value(stream)); - - return out_col; -} - -} // namespace cudf::io::json::detail diff --git a/cpp/src/io/json/json_column.cu b/cpp/src/io/json/json_column.cu index cabf904f020..5d7fb9d6b43 100644 --- a/cpp/src/io/json/json_column.cu +++ b/cpp/src/io/json/json_column.cu @@ -16,14 +16,13 @@ #include "nested_json.hpp" #include -#include +#include #include #include #include #include #include -#include #include #include #include @@ -331,23 +330,27 @@ std::vector copy_strings_to_host(device_span input, { CUDF_FUNC_RANGE(); auto const num_strings = node_range_begin.size(); - rmm::device_uvector> string_views(num_strings, stream); + rmm::device_uvector string_offsets(num_strings, stream); + rmm::device_uvector string_lengths(num_strings, stream); auto d_offset_pairs = thrust::make_zip_iterator(node_range_begin.begin(), node_range_end.begin()); thrust::transform(rmm::exec_policy(stream), d_offset_pairs, d_offset_pairs + num_strings, - string_views.begin(), - [data = input.data()] __device__(auto const& offsets) { + thrust::make_zip_iterator(string_offsets.begin(), string_lengths.begin()), + [] __device__(auto const& offsets) { // Note: first character for non-field columns - return thrust::make_pair( - data + thrust::get<0>(offsets), + return thrust::make_tuple( + static_cast(thrust::get<0>(offsets)), static_cast(thrust::get<1>(offsets) - thrust::get<0>(offsets))); }); cudf::io::parse_options_view options_view{}; options_view.quotechar = '\0'; // no quotes options_view.keepquotes = true; - auto d_column_names = parse_data(string_views.begin(), + auto d_offset_length_it = + thrust::make_zip_iterator(string_offsets.begin(), string_lengths.begin()); + auto d_column_names = parse_data(input.data(), + d_offset_length_it, num_strings, data_type{type_id::STRING}, rmm::device_buffer{}, @@ -355,7 +358,7 @@ std::vector copy_strings_to_host(device_span input, options_view, stream, rmm::mr::get_current_device_resource()); - auto to_host = [stream](auto const& col) { + auto to_host = [stream](auto const& col) { if (col.is_empty()) return std::vector{}; auto const scv = cudf::strings_column_view(col); auto const h_chars = cudf::detail::make_std_vector_sync( @@ -763,19 +766,6 @@ std::pair, std::vector> device_json_co // TODO how about directly storing pair in json_column? 
auto offset_length_it = thrust::make_zip_iterator(json_col.string_offsets.begin(), json_col.string_lengths.begin()); - // Prepare iterator that returns (string_offset, string_length)-pairs needed by inference - auto string_ranges_it = - thrust::make_transform_iterator(offset_length_it, [] __device__(auto ip) { - return thrust::pair{ - thrust::get<0>(ip), static_cast(thrust::get<1>(ip))}; - }); - - // Prepare iterator that returns (string_ptr, string_length)-pairs needed by type conversion - auto string_spans_it = thrust::make_transform_iterator( - offset_length_it, [data = d_input.data()] __device__(auto ip) { - return thrust::pair{ - data + thrust::get<0>(ip), static_cast(thrust::get<1>(ip))}; - }); data_type target_type{}; @@ -790,12 +780,13 @@ std::pair, std::vector> device_json_co // Infer column type, if we don't have an explicit type for it else { target_type = cudf::io::detail::infer_data_type( - options.json_view(), d_input, string_ranges_it, col_size, stream); + options.json_view(), d_input, offset_length_it, col_size, stream); } auto [result_bitmask, null_count] = make_validity(json_col); // Convert strings to the inferred data type - auto col = parse_data(string_spans_it, + auto col = parse_data(d_input.data(), + offset_length_it, col_size, target_type, std::move(result_bitmask), diff --git a/cpp/src/io/json/nested_json_gpu.cu b/cpp/src/io/json/nested_json_gpu.cu index 0b49f97597d..06ac11485cb 100644 --- a/cpp/src/io/json/nested_json_gpu.cu +++ b/cpp/src/io/json/nested_json_gpu.cu @@ -19,14 +19,13 @@ #include #include #include -#include +#include #include #include #include #include #include -#include #include #include #include @@ -1949,20 +1948,6 @@ std::pair, std::vector> json_column_to auto offset_length_it = thrust::make_zip_iterator(d_string_offsets.begin(), d_string_lengths.begin()); - // Prepare iterator that returns (string_offset, string_length)-pairs needed by inference - auto string_ranges_it = - thrust::make_transform_iterator(offset_length_it, [] __device__(auto ip) { - return thrust::pair{ - thrust::get<0>(ip), static_cast(thrust::get<1>(ip))}; - }); - - // Prepare iterator that returns (string_ptr, string_length)-pairs needed by type conversion - auto string_spans_it = thrust::make_transform_iterator( - offset_length_it, [data = d_input.data()] __device__(auto ip) { - return thrust::pair{ - data + thrust::get<0>(ip), static_cast(thrust::get<1>(ip))}; - }); - data_type target_type{}; if (schema.has_value()) { @@ -1978,7 +1963,7 @@ std::pair, std::vector> json_column_to target_type = cudf::io::detail::infer_data_type(parsing_options(options, stream).json_view(), d_input, - string_ranges_it, + offset_length_it, col_size, stream); } @@ -1986,7 +1971,8 @@ std::pair, std::vector> json_column_to auto [result_bitmask, null_count] = make_validity(json_col); // Convert strings to the inferred data type - auto col = parse_data(string_spans_it, + auto col = parse_data(d_input.data(), + offset_length_it, col_size, target_type, std::move(result_bitmask), diff --git a/cpp/src/io/json/write_json.cu b/cpp/src/io/json/write_json.cu index 1e44522ed33..2d363c51fce 100644 --- a/cpp/src/io/json/write_json.cu +++ b/cpp/src/io/json/write_json.cu @@ -20,6 +20,7 @@ */ #include +#include #include #include @@ -27,9 +28,9 @@ #include #include #include +#include #include #include -#include #include #include #include diff --git a/cpp/src/io/utilities/data_casting.cu b/cpp/src/io/utilities/data_casting.cu new file mode 100644 index 00000000000..1772e5e43fa --- /dev/null +++ 
b/cpp/src/io/utilities/data_casting.cu @@ -0,0 +1,987 @@ +/* + * Copyright (c) 2022-2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include +#include +#include + +#include + +#include +#include + +namespace cudf::io::json::detail { + +constexpr auto SINGLE_THREAD_THRESHOLD = 128; +constexpr auto WARP_THRESHOLD = 128 * 128; // 16K + +// Unicode code point escape sequence +static constexpr char UNICODE_SEQ = 0x7F; + +// Invalid escape sequence +static constexpr char NON_ESCAPE_CHAR = 0x7E; + +// Unicode code point escape sequence prefix comprises '\' and 'u' characters +static constexpr size_type UNICODE_ESC_PREFIX = 2; + +// Unicode code point escape sequence comprises four hex characters +static constexpr size_type UNICODE_HEX_DIGIT_COUNT = 4; + +// A unicode code point escape sequence is \uXXXX +static auto constexpr NUM_UNICODE_ESC_SEQ_CHARS = UNICODE_ESC_PREFIX + UNICODE_HEX_DIGIT_COUNT; + +static constexpr auto UTF16_HIGH_SURROGATE_BEGIN = 0xD800; +static constexpr auto UTF16_HIGH_SURROGATE_END = 0xDC00; +static constexpr auto UTF16_LOW_SURROGATE_BEGIN = 0xDC00; +static constexpr auto UTF16_LOW_SURROGATE_END = 0xE000; + +/** + * @brief Describing whether data casting of a certain item succeed, the item was parsed to null, or + * whether type casting failed. + */ +enum class data_casting_result { PARSING_SUCCESS, PARSED_TO_NULL, PARSING_FAILURE }; + +/** + * @brief Providing additional information about the type casting result. + */ +struct data_casting_result_info { + // Number of bytes written to output + size_type bytes; + // Whether parsing succeeded, item was parsed to null, or failed + data_casting_result result; +}; + +/** + * @brief Returns the character to output for a given escaped character that's following a + * backslash. + * + * @param escaped_char The character following the backslash. + * @return The character to output for a given character that's following a backslash + */ +__device__ __forceinline__ char get_escape_char(char escaped_char) +{ + switch (escaped_char) { + case '"': return '"'; + case '\\': return '\\'; + case '/': return '/'; + case 'b': return '\b'; + case 'f': return '\f'; + case 'n': return '\n'; + case 'r': return '\r'; + case 't': return '\t'; + case 'u': return UNICODE_SEQ; + default: return NON_ESCAPE_CHAR; + } +} + +/** + * @brief Parses the hex value from the four hex digits of a unicode code point escape sequence + * \uXXXX. + * + * @param str Pointer to the first (most-significant) hex digit + * @return The parsed hex value if successful, -1 otherwise. 
+ */ +__device__ __forceinline__ int32_t parse_unicode_hex(char const* str) +{ + // Prepare result + int32_t result = 0, base = 1; + constexpr int32_t hex_radix = 16; + + // Iterate over hex digits right-to-left + size_type index = UNICODE_HEX_DIGIT_COUNT; + while (index-- > 0) { + char const ch = str[index]; + if (ch >= '0' && ch <= '9') { + result += static_cast((ch - '0') + 0) * base; + base *= hex_radix; + } else if (ch >= 'A' && ch <= 'F') { + result += static_cast((ch - 'A') + 10) * base; + base *= hex_radix; + } else if (ch >= 'a' && ch <= 'f') { + result += static_cast((ch - 'a') + 10) * base; + base *= hex_radix; + } else { + return -1; + } + } + return result; +} + +/** + * @brief Writes the UTF-8 byte sequence to \p out_it and returns the number of bytes written to + * \p out_it + */ +constexpr size_type write_utf8_char(char_utf8 character, char*& out_it) +{ + auto const bytes = (out_it == nullptr) ? strings::detail::bytes_in_char_utf8(character) + : strings::detail::from_char_utf8(character, out_it); + if (out_it) out_it += bytes; + return bytes; +} + +/** + * @brief Processes a string, replaces escape sequences and optionally strips off the quote + * characters. + * + * @tparam in_iterator_t A bidirectional input iterator type whose value_type is convertible to + * char + * @param in_begin Iterator to the first item to process + * @param in_end Iterator to one past the last item to process + * @param d_buffer Output character buffer to the first item to write + * @param options Settings for controlling string processing behavior + * @return A struct of (num_bytes_written, parsing_success_result), where num_bytes_written is + * the number of bytes written to d_buffer, parsing_success_result is enum value indicating whether + * parsing succeeded, item was parsed to null, or failed. 
+ */ +template +__device__ __forceinline__ data_casting_result_info +process_string(in_iterator_t in_begin, + in_iterator_t in_end, + char* d_buffer, + cudf::io::parse_options_view const& options) +{ + int32_t bytes = 0; + auto const num_in_chars = thrust::distance(in_begin, in_end); + // String values are indicated by keeping the quote character + bool const is_string_value = + num_in_chars >= 2LL && + (options.quotechar == '\0' || + (*in_begin == options.quotechar) && (*thrust::prev(in_end) == options.quotechar)); + + // Copy literal/numeric value + if (not is_string_value) { + bytes += (in_end - in_begin); + if (d_buffer) d_buffer = thrust::copy(thrust::seq, in_begin, in_end, d_buffer); + return {bytes, data_casting_result::PARSING_SUCCESS}; + } + char constexpr backslash_char = '\\'; + + // Escape-flag, set after encountering a backslash character + bool is_prev_char_escape = false; + + // Exclude beginning and ending quote chars from string range + if (!options.keepquotes) { + ++in_begin; + --in_end; + } + + // Iterate over the input + while (in_begin != in_end) { + // Copy single character to output + if (!is_prev_char_escape) { + is_prev_char_escape = (*in_begin == backslash_char); + if (!is_prev_char_escape) { + if (d_buffer) *d_buffer++ = *in_begin; + ++bytes; + } + ++in_begin; + continue; + } + + // Previous char indicated beginning of escape sequence + // Reset escape flag for next loop iteration + is_prev_char_escape = false; + + // Check the character that is supposed to be escaped + auto escaped_char = get_escape_char(*in_begin); + + // We escaped an invalid escape character -> "fail"/null for this item + if (escaped_char == NON_ESCAPE_CHAR) { return {bytes, data_casting_result::PARSING_FAILURE}; } + + // Regular, single-character escape + if (escaped_char != UNICODE_SEQ) { + if (d_buffer) *d_buffer++ = escaped_char; + ++bytes; + ++in_begin; + continue; + } + + // This is an escape sequence of a unicode code point: \uXXXX, + // where each X in XXXX represents a hex digit + // Skip over the 'u' char from \uXXXX to the first hex digit + ++in_begin; + + // Make sure that there's at least 4 characters left from the + // input, which are expected to be hex digits + if (thrust::distance(in_begin, in_end) < UNICODE_HEX_DIGIT_COUNT) { + return {bytes, data_casting_result::PARSING_FAILURE}; + } + + auto hex_val = parse_unicode_hex(in_begin); + + // Couldn't parse hex values from the four-character sequence -> "fail"/null for this item + if (hex_val < 0) { return {bytes, data_casting_result::PARSING_FAILURE}; } + + // Skip over the four hex digits + thrust::advance(in_begin, UNICODE_HEX_DIGIT_COUNT); + + // If this may be a UTF-16 encoded surrogate pair: + // we expect another \uXXXX sequence + int32_t hex_low_val = 0; + if (hex_val >= UTF16_HIGH_SURROGATE_BEGIN && hex_val < UTF16_HIGH_SURROGATE_END && + thrust::distance(in_begin, in_end) >= NUM_UNICODE_ESC_SEQ_CHARS && + *in_begin == backslash_char && *thrust::next(in_begin) == 'u') { + // Try to parse hex value following the '\' and 'u' characters from what may be a UTF16 low + // surrogate + hex_low_val = parse_unicode_hex(thrust::next(in_begin, 2)); + } + + // This is indeed a UTF16 surrogate pair + if (hex_val >= UTF16_HIGH_SURROGATE_BEGIN && hex_val < UTF16_HIGH_SURROGATE_END && + hex_low_val >= UTF16_LOW_SURROGATE_BEGIN && hex_low_val < UTF16_LOW_SURROGATE_END) { + // Skip over the second \uXXXX sequence + thrust::advance(in_begin, NUM_UNICODE_ESC_SEQ_CHARS); + + // Compute UTF16-encoded code point + uint32_t unicode_code_point 
= 0x10000 + ((hex_val - UTF16_HIGH_SURROGATE_BEGIN) << 10) + + (hex_low_val - UTF16_LOW_SURROGATE_BEGIN); + auto utf8_chars = strings::detail::codepoint_to_utf8(unicode_code_point); + bytes += write_utf8_char(utf8_chars, d_buffer); + } else { + // Just a single \uXXXX sequence + auto utf8_chars = strings::detail::codepoint_to_utf8(hex_val); + bytes += write_utf8_char(utf8_chars, d_buffer); + } + } + + // The last character of the input is a backslash -> "fail"/null for this item + if (is_prev_char_escape) { return {bytes, data_casting_result::PARSING_FAILURE}; } + return {bytes, data_casting_result::PARSING_SUCCESS}; +} + +/** + * @brief Data structure to hold 1 bit per thread with previous `UNICODE_LOOK_BACK` bits stored in a + * warp. + * + * @tparam num_warps number of warps in the block + */ +template +struct bitfield_warp { + static constexpr auto UNICODE_LOOK_BACK{5}; + // 5 because for skipping unicode hex chars, look back up to 5 chars are needed. + // 5+32 for each warp. + bool is_slash[num_warps][UNICODE_LOOK_BACK + cudf::detail::warp_size]; + + /// Sets all bits to 0 + __device__ void reset(unsigned warp_id) + { + if (threadIdx.x % cudf::detail::warp_size < UNICODE_LOOK_BACK) { + is_slash[warp_id][threadIdx.x % cudf::detail::warp_size] = 0; + } + is_slash[warp_id][threadIdx.x % cudf::detail::warp_size + UNICODE_LOOK_BACK] = 0; + } + + /// Shifts UNICODE_LOOK_BACK bits to the left to hold the previous UNICODE_LOOK_BACK bits + __device__ void shift(unsigned warp_id) + { + if (threadIdx.x % cudf::detail::warp_size < UNICODE_LOOK_BACK) + is_slash[warp_id][threadIdx.x % cudf::detail::warp_size] = + is_slash[warp_id][cudf::detail::warp_size + threadIdx.x % cudf::detail::warp_size]; + __syncwarp(); + } + + /// Each thread in a warp sets its own bit. + __device__ void set_bits(unsigned warp_id, bool is_escaping_backslash) + { + is_slash[warp_id][UNICODE_LOOK_BACK + threadIdx.x % cudf::detail::warp_size] = + is_escaping_backslash; + __syncwarp(); + } + + /// Each thread in a warp gets the requested bit. + __device__ bool get_bit(unsigned warp_id, int bit_index) + { + return is_slash[warp_id][UNICODE_LOOK_BACK + bit_index]; + } +}; + +/** + * @brief Data structure to hold 1 bit per thread with previous `UNICODE_LOOK_BACK` bits stored in a + * block. + * + * @tparam num_warps number of warps in the block + */ +template +struct bitfield_block { + static constexpr auto UNICODE_LOOK_BACK{5}; + // 5 because for skipping unicode hex chars, look back up to 5 chars are needed. + // 5 + num_warps*32 for entire block + bool is_slash[UNICODE_LOOK_BACK + num_warps * cudf::detail::warp_size]; + + /// Sets all bits to 0 + __device__ void reset(unsigned warp_id) + { + if (threadIdx.x < UNICODE_LOOK_BACK) { is_slash[threadIdx.x] = 0; } + is_slash[threadIdx.x + UNICODE_LOOK_BACK] = 0; + } + + /// Shifts UNICODE_LOOK_BACK bits to the left to hold the previous UNICODE_LOOK_BACK bits + __device__ void shift(unsigned warp_id) + { + if (threadIdx.x < UNICODE_LOOK_BACK) + is_slash[threadIdx.x] = is_slash[num_warps * cudf::detail::warp_size + threadIdx.x]; + __syncthreads(); + } + + /// Each thread in a block sets its own bit. + __device__ void set_bits(unsigned warp_id, bool is_escaping_backslash) + { + is_slash[UNICODE_LOOK_BACK + threadIdx.x] = is_escaping_backslash; + __syncthreads(); + } + + /// Each thread in a block gets the requested bit. 
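+  /// (bit_index may be negative, down to -UNICODE_LOOK_BACK, for the first threads; such reads
+  /// fall into the look-back slots carried over from the previous iteration by shift().)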
+ __device__ bool get_bit(unsigned warp_id, int bit_index) + { + return is_slash[UNICODE_LOOK_BACK + bit_index]; + } +}; + +// Algorithm: warp/block parallel version of string_parse and process_string() +// Decoding character classes (u8, u16, \*, *): +// character count: input->output +// \uXXXX 6->2/3/4 +// \uXXXX\uXXXX 12->2/3/4 +// \" 2->1 +// * 1->1 +// +// ERROR conditions. (all collaborating threads quit) +// c=='\' & curr_idx == end_idx-1; +// [c-1]=='\' & get_escape[c]==NEC +// [c-1]=='\' & [c]=='u' & end_idx-curr_idx < UNICODE_HEX_DIGIT_COUNT +// [c-1]=='\' & [c]=='u' & end_idx-curr_idx >= UNICODE_HEX_DIGIT_COUNT && non-hex +// +// skip conditions. (current thread skips this char, no output) +// c=='\' skip. (Escaping char only) +// [c-2]=='\' && [c-1]=='u' for [2,1], [3,2] [4,5], [5, 6], skip. +// +// write conditions. (write to d_buffer) +// [c-1]!='\' & [c]!='\' write [c] +// [c-1]!='\' & [c]=='\' skip (already covered in skip conditions) +// [c-1]=='\' & [c]!=NEC && [c]!=UNICODE_SEQ, write [c] +// [c-1]=='\' & [c]=='u' & end_idx-curr_idx >= UNICODE_HEX_DIGIT_COUNT && hex, DECODE +// [c+1:4]=curr_hex_val +// // if [c+5]=='\' & [c+6]=='u' & end_idx-curr_idx >= UNICODE_HEX_DIGIT_COUNT && +// hex,DECODE [c+7:4]=next_hex_val +// // if [c-7]=='\' & [c-6]=='u' & end_idx-curr_idx >= UNICODE_HEX_DIGIT_COUNT && +// hex,DECODE [c-5:4]=prev_hex_val prev_hex_val, curr_hex_val, next_hex_val +// // if prev_hex_val in high, curr_hex_val in low, skip. +// // if curr_hex_val in high, next_hex_val in low, write [u16] +// if curr_hex_val not in high, write [u8] +// before writing, find num of output characters per threads, +// then do intra-warp/intra-block scan for out_idx +// propagate offset from next iteration to carry forward. +// Uses 1 warp per string or 1 block per string + +/** + * @brief Warp/Block parallel version of string_parse functor + * + * @tparam is_warp True if 1 warp per string, False if 1 block per string + * @tparam num_warps Number of warps per block + * @tparam str_tuple_it Iterator type for tuple with string pointer and its length + * @param str_tuples iterator of tuple with string pointer and its length + * @param total_out_strings Number of string rows to be processed + * @param str_counter Counter to keep track of processed number of strings + * @param null_mask Null mask + * @param null_count_data pointer to store null count + * @param options Settings for controlling string processing behavior + * @param d_offsets Offsets to identify where to store the results for each string + * @param d_chars Character array to store the characters of strings + */ +template +__global__ void parse_fn_string_parallel(str_tuple_it str_tuples, + size_type total_out_strings, + size_type* str_counter, + bitmask_type* null_mask, + size_type* null_count_data, + cudf::io::parse_options_view const options, + size_type* d_offsets, + char* d_chars) +{ + constexpr auto BLOCK_SIZE = + is_warp ? cudf::detail::warp_size : cudf::detail::warp_size * num_warps; + size_type lane = is_warp ? 
(threadIdx.x % BLOCK_SIZE) : threadIdx.x; + + // get 1-string index per warp/block + auto get_next_string = [&]() { + if constexpr (is_warp) { + size_type istring; + if (lane == 0) { istring = atomicAdd(str_counter, 1); } + return __shfl_sync(0xffffffff, istring, 0); + } else { + // Ensure lane 0 doesn't update istring before all threads have read the previous iteration's + // istring value + __syncthreads(); + __shared__ size_type istring; + if (lane == 0) { istring = atomicAdd(str_counter, 1); } + __syncthreads(); + return istring; + } + }; + // grid-stride loop. + for (size_type istring = get_next_string(); istring < total_out_strings; + istring = get_next_string()) { + // skip nulls + if (null_mask != nullptr && not bit_is_set(null_mask, istring)) { + if (!d_chars && lane == 0) d_offsets[istring] = 0; + continue; // gride-stride return; + } + + auto in_begin = str_tuples[istring].first; + auto in_end = in_begin + str_tuples[istring].second; + auto const num_in_chars = str_tuples[istring].second; + if constexpr (is_warp) { + if (num_in_chars <= SINGLE_THREAD_THRESHOLD or num_in_chars > WARP_THRESHOLD) continue; + } else { + if (num_in_chars <= WARP_THRESHOLD) continue; + } + + // Check if the value corresponds to the null literal + if (!d_chars) { + auto const is_null_literal = serialized_trie_contains( + options.trie_na, {in_begin, static_cast(num_in_chars)}); + if (is_null_literal && null_mask != nullptr) { + if (lane == 0) { + clear_bit(null_mask, istring); + atomicAdd(null_count_data, 1); + if (!d_chars) d_offsets[istring] = 0; + } + continue; // gride-stride return; + } + } + // String values are indicated by keeping the quote character + bool const is_string_value = + num_in_chars >= 2LL && + (options.quotechar == '\0' || + (*in_begin == options.quotechar) && (*thrust::prev(in_end) == options.quotechar)); + char* d_buffer = d_chars ? d_chars + d_offsets[istring] : nullptr; + + // Copy literal/numeric value + if (not is_string_value) { + if (!d_chars) { + if (lane == 0) { d_offsets[istring] = in_end - in_begin; } + } else { + for (thread_index_type char_index = lane; char_index < (in_end - in_begin); + char_index += BLOCK_SIZE) { + d_buffer[char_index] = in_begin[char_index]; + } + } + continue; // gride-stride return; + } + + // Exclude beginning and ending quote chars from string range + if (!options.keepquotes) { + ++in_begin; + --in_end; + } + // warp-parallelized or block-parallelized process_string() + + auto is_hex = [](auto ch) { + return (ch >= '0' && ch <= '9') || (ch >= 'A' && ch <= 'F') || (ch >= 'a' && ch <= 'f'); + }; + + // for backslash scan calculation: is_previous_escaping_backslash + [[maybe_unused]] auto warp_id = threadIdx.x / cudf::detail::warp_size; + bool init_state_reg; + __shared__ bool init_state_shared; + size_type last_offset_reg; + __shared__ size_type last_offset_shared; + bool& init_state(is_warp ? init_state_reg : init_state_shared); + size_type& last_offset(is_warp ? last_offset_reg : last_offset_shared); + if (is_warp || lane == 0) { + init_state = false; + last_offset = 0; + } + using bitfield = + std::conditional_t, bitfield_block>; + __shared__ bitfield is_slash; + is_slash.reset(warp_id); + __syncthreads(); + // 0-31, 32-63, ... i*32-n. + // entire warp executes but with mask. + for (thread_index_type char_index = lane; + char_index < cudf::util::round_up_safe(in_end - in_begin, static_cast(BLOCK_SIZE)); + char_index += BLOCK_SIZE) { + bool const is_within_bounds = char_index < (in_end - in_begin); + auto const MASK = is_warp ? 
__ballot_sync(0xffffffff, is_within_bounds) : 0xffffffff; + auto const c = is_within_bounds ? in_begin[char_index] : '\0'; + auto const prev_c = (char_index > 0 and is_within_bounds) ? in_begin[char_index - 1] : '\0'; + auto const escaped_char = get_escape_char(c); + + bool is_escaping_backslash{false}; + [[maybe_unused]] bool is_prev_escaping_backslash{false}; + // To check current is backslash by checking if previous is backslash. + // curr = !prev & c=='\\' + // So, scan is required from beginning of string. + // State table approach (intra-warp FST) (intra-block FST) + // 2 states: Not-Slash(NS), Slash(S). + // prev / * + // NS S NS + // S NS NS + // After inclusive scan, all current S states translate to escaping backslash. + // All escaping backslash should be skipped. + + struct state_table { + // using bit fields instead of state[2] + bool state0 : 1; + bool state1 : 1; + bool inline __device__ get(bool init_state) const { return init_state ? state1 : state0; } + }; + state_table curr{is_within_bounds && c == '\\', false}; // state transition vector. + auto composite_op = [](state_table op1, state_table op2) { + // equivalent of state_table{op2.state[op1.state[0]], op2.state[op1.state[1]]}; + return state_table{op1.state0 ? op2.state1 : op2.state0, + op1.state1 ? op2.state1 : op2.state0}; + }; + state_table scanned; + // inclusive scan of escaping backslashes + if constexpr (is_warp) { + using SlashScan = cub::WarpScan; + __shared__ typename SlashScan::TempStorage temp_slash[num_warps]; + SlashScan(temp_slash[warp_id]).InclusiveScan(curr, scanned, composite_op); + is_escaping_backslash = scanned.get(init_state); + init_state = __shfl_sync(MASK, is_escaping_backslash, BLOCK_SIZE - 1); + __syncwarp(); + is_slash.shift(warp_id); + is_slash.set_bits(warp_id, is_escaping_backslash); + is_prev_escaping_backslash = is_slash.get_bit(warp_id, lane - 1); + } else { + using SlashScan = cub::BlockScan; + __shared__ typename SlashScan::TempStorage temp_slash; + SlashScan(temp_slash).InclusiveScan(curr, scanned, composite_op); + is_escaping_backslash = scanned.get(init_state); + __syncthreads(); + if (threadIdx.x == BLOCK_SIZE - 1) init_state = is_escaping_backslash; + __syncthreads(); + is_slash.shift(warp_id); + is_slash.set_bits(warp_id, is_escaping_backslash); + is_prev_escaping_backslash = is_slash.get_bit(warp_id, lane - 1); + // There is another __syncthreads() at the end of for-loop. + } + + // String with parsing errors are made as null + bool error = false; + if (is_within_bounds) { + // curr=='\' and end, or prev=='\' and curr=='u' and end-curr < UNICODE_HEX_DIGIT_COUNT + // or prev=='\' and curr=='u' and end-curr >= UNICODE_HEX_DIGIT_COUNT and any non-hex + error |= (is_escaping_backslash /*c == '\\'*/ && char_index == (in_end - in_begin) - 1); + error |= (is_prev_escaping_backslash && escaped_char == NON_ESCAPE_CHAR); + error |= (is_prev_escaping_backslash && c == 'u' && + ((in_begin + char_index + UNICODE_HEX_DIGIT_COUNT >= in_end) | + !is_hex(in_begin[char_index + 1]) | !is_hex(in_begin[char_index + 2]) | + !is_hex(in_begin[char_index + 3]) | !is_hex(in_begin[char_index + 4]))); + } + // Make sure all threads have no errors before continuing + if constexpr (is_warp) { + error = __any_sync(MASK, error); + } else { + using ErrorReduce = cub::BlockReduce; + __shared__ typename ErrorReduce::TempStorage temp_storage_error; + __shared__ bool error_reduced; + error_reduced = ErrorReduce(temp_storage_error).Sum(error); // TODO use cub::LogicalOR. 
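+        // Summing the per-thread boolean flags acts as a logical OR here: any nonzero sum means
+        // at least one thread in the block observed an error.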
+ // only valid in thread0, so shared memory is used for broadcast. + __syncthreads(); + error = error_reduced; + } + // If any thread has an error, skip the rest of the string and make this string as null + if (error) { + if (!d_chars && lane == 0) { + if (null_mask != nullptr) { + clear_bit(null_mask, istring); + atomicAdd(null_count_data, 1); + } + last_offset = 0; + d_offsets[istring] = 0; + } + if constexpr (!is_warp) { __syncthreads(); } + break; // gride-stride return; + } + + // Skipping non-copied escaped characters + bool skip = !is_within_bounds; // false; + // skip \ for \" \\ \/ \b \f \n \r \t \uXXXX + skip |= is_escaping_backslash; + if (is_within_bounds) { + // skip X for each X in \uXXXX + skip |= + char_index >= 2 && is_slash.get_bit(warp_id, lane - 2) && in_begin[char_index - 1] == 'u'; + skip |= + char_index >= 3 && is_slash.get_bit(warp_id, lane - 3) && in_begin[char_index - 2] == 'u'; + skip |= + char_index >= 4 && is_slash.get_bit(warp_id, lane - 4) && in_begin[char_index - 3] == 'u'; + skip |= + char_index >= 5 && is_slash.get_bit(warp_id, lane - 5) && in_begin[char_index - 4] == 'u'; + } + int this_num_out = 0; + cudf::char_utf8 write_char{}; + + if (!skip) { + // 1. Unescaped character + if (!is_prev_escaping_backslash) { + this_num_out = 1; + // writes char directly for non-unicode + } else { + // 2. Escaped character + if (escaped_char != UNICODE_SEQ) { + this_num_out = 1; + // writes char directly for non-unicode + } else { + // 3. Unicode + // UTF8 \uXXXX + auto hex_val = parse_unicode_hex(in_begin + char_index + 1); + auto hex_low_val = 0; + // UTF16 \uXXXX\uXXXX + // Note: no need for scanned_backslash below because we already know that + // only '\u' check is enough. + if (hex_val >= UTF16_HIGH_SURROGATE_BEGIN && hex_val < UTF16_HIGH_SURROGATE_END && + (in_begin + char_index + UNICODE_HEX_DIGIT_COUNT + NUM_UNICODE_ESC_SEQ_CHARS) < + in_end && + in_begin[char_index + NUM_UNICODE_ESC_SEQ_CHARS - 1] == '\\' && + in_begin[char_index + NUM_UNICODE_ESC_SEQ_CHARS] == 'u') { + hex_low_val = parse_unicode_hex(in_begin + char_index + 1 + 6); + } + if (hex_val >= UTF16_HIGH_SURROGATE_BEGIN && hex_val < UTF16_HIGH_SURROGATE_END && + hex_low_val >= UTF16_LOW_SURROGATE_BEGIN && hex_low_val < UTF16_LOW_SURROGATE_END) { + // Compute UTF16-encoded code point + uint32_t unicode_code_point = 0x10000 + + ((hex_val - UTF16_HIGH_SURROGATE_BEGIN) << 10) + + (hex_low_val - UTF16_LOW_SURROGATE_BEGIN); + write_char = strings::detail::codepoint_to_utf8(unicode_code_point); + this_num_out = strings::detail::bytes_in_char_utf8(write_char); + } else { + // if hex_val is high surrogate, ideally it should be parsing failure. + // but skipping it as other parsers do this too. + if (hex_val >= UTF16_LOW_SURROGATE_BEGIN && hex_val < UTF16_LOW_SURROGATE_END) { + // Ideally this should be skipped if previous char is high surrogate. + skip = true; + this_num_out = 0; + write_char = 0; + } else { + // if UTF8 + write_char = strings::detail::codepoint_to_utf8(hex_val); + this_num_out = strings::detail::bytes_in_char_utf8(write_char); + } + } + } + } + } // !skip end. 
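+      // At this point each surviving thread knows how many output bytes it will emit
+      // (this_num_out); the exclusive scan below converts those counts into per-thread write
+      // offsets, and last_offset carries the running total across loop iterations.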
+ { + // compute offset to write output for each thread + size_type offset; + if constexpr (is_warp) { + using OffsetScan = cub::WarpScan; + __shared__ typename OffsetScan::TempStorage temp_storage[num_warps]; + OffsetScan(temp_storage[warp_id]).ExclusiveSum(this_num_out, offset); + } else { + using OffsetScan = cub::BlockScan; + __shared__ typename OffsetScan::TempStorage temp_storage; + OffsetScan(temp_storage).ExclusiveSum(this_num_out, offset); + __syncthreads(); + } + offset += last_offset; + // Write output + if (d_chars && !skip) { + auto const is_not_unicode = (!is_prev_escaping_backslash) || escaped_char != UNICODE_SEQ; + if (is_not_unicode) { + *(d_buffer + offset) = (!is_prev_escaping_backslash) ? c : escaped_char; + } else { + strings::detail::from_char_utf8(write_char, d_buffer + offset); + } + } + offset += this_num_out; + if constexpr (is_warp) { + last_offset = __shfl_sync(0xffffffff, offset, BLOCK_SIZE - 1); + } else { + __syncthreads(); + if (threadIdx.x == BLOCK_SIZE - 1) last_offset = offset; + __syncthreads(); + } + } + } // char for-loop + if (!d_chars && lane == 0) { d_offsets[istring] = last_offset; } + } // grid-stride for-loop +} + +template +struct string_parse { + str_tuple_it str_tuples; + bitmask_type* null_mask; + size_type* null_count_data; + cudf::io::parse_options_view const options; + size_type* d_offsets{}; + char* d_chars{}; + + __device__ void operator()(size_type idx) + { + if (null_mask != nullptr && not bit_is_set(null_mask, idx)) { + if (!d_chars) d_offsets[idx] = 0; + return; + } + auto const in_begin = str_tuples[idx].first; + auto const in_end = in_begin + str_tuples[idx].second; + auto const num_in_chars = str_tuples[idx].second; + + if (num_in_chars > SINGLE_THREAD_THRESHOLD) return; + + // Check if the value corresponds to the null literal + if (!d_chars) { + auto const is_null_literal = serialized_trie_contains( + options.trie_na, {in_begin, static_cast(num_in_chars)}); + if (is_null_literal && null_mask != nullptr) { + clear_bit(null_mask, idx); + atomicAdd(null_count_data, 1); + if (!d_chars) d_offsets[idx] = 0; + return; + } + } + + char* d_buffer = d_chars ? 
d_chars + d_offsets[idx] : nullptr; + auto str_process_info = process_string(in_begin, in_end, d_buffer, options); + if (str_process_info.result != data_casting_result::PARSING_SUCCESS) { + if (null_mask != nullptr) { + clear_bit(null_mask, idx); + atomicAdd(null_count_data, 1); + } + if (!d_chars) d_offsets[idx] = 0; + } else { + if (!d_chars) d_offsets[idx] = str_process_info.bytes; + } + } +}; + +template +struct to_string_view_pair { + SymbolT const* data; + to_string_view_pair(SymbolT const* _data) : data(_data) {} + __device__ auto operator()(thrust::tuple ip) + { + return thrust::pair{data + thrust::get<0>(ip), + static_cast(thrust::get<1>(ip))}; + } +}; + +template +static std::unique_ptr parse_string(string_view_pair_it str_tuples, + size_type col_size, + rmm::device_buffer&& null_mask, + rmm::device_scalar& d_null_count, + cudf::io::parse_options_view const& options, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + // CUDF_FUNC_RANGE(); + + auto const max_length = thrust::transform_reduce( + rmm::exec_policy(stream), + str_tuples, + str_tuples + col_size, + [] __device__(auto t) { return t.second; }, + size_type{0}, + thrust::maximum{}); + + auto offsets = cudf::make_numeric_column( + data_type{type_to_id()}, col_size + 1, cudf::mask_state::UNALLOCATED, stream, mr); + auto d_offsets = offsets->mutable_view().data(); + auto null_count_data = d_null_count.data(); + + auto single_thread_fn = string_parse{ + str_tuples, static_cast(null_mask.data()), null_count_data, options, d_offsets}; + thrust::for_each_n(rmm::exec_policy(stream), + thrust::make_counting_iterator(0), + col_size, + single_thread_fn); + + constexpr auto warps_per_block = 8; + constexpr int threads_per_block = cudf::detail::warp_size * warps_per_block; + auto num_blocks = cudf::util::div_rounding_up_safe(col_size, warps_per_block); + auto str_counter = cudf::numeric_scalar(size_type{0}, true, stream); + + // TODO run these independent kernels in parallel streams. 
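+  // Size-based dispatch: the thrust::for_each_n call above handles strings of up to
+  // SINGLE_THREAD_THRESHOLD bytes with one thread per string; the kernels below cover longer
+  // strings with one warp per string, and one block per string beyond WARP_THRESHOLD.
+  // For example, a 100-byte value stays on the single-thread path, a 5000-byte value is decoded
+  // by a warp, and a 50000-byte value by a whole block.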
+ if (max_length > SINGLE_THREAD_THRESHOLD) { + parse_fn_string_parallel + <<>>( + str_tuples, + col_size, + str_counter.data(), + static_cast(null_mask.data()), + null_count_data, + options, + d_offsets, + nullptr); + } + + if (max_length > WARP_THRESHOLD) { + // for strings longer than WARP_THRESHOLD, 1 block per string + str_counter.set_value(0, stream); + parse_fn_string_parallel + <<>>( + str_tuples, + col_size, + str_counter.data(), + static_cast(null_mask.data()), + null_count_data, + options, + d_offsets, + nullptr); + } + auto const bytes = + cudf::detail::sizes_to_offsets(d_offsets, d_offsets + col_size + 1, d_offsets, stream); + CUDF_EXPECTS(bytes <= std::numeric_limits::max(), + "Size of output exceeds the column size limit", + std::overflow_error); + + // CHARS column + std::unique_ptr chars = + strings::detail::create_chars_child_column(static_cast(bytes), stream, mr); + auto d_chars = chars->mutable_view().data(); + + single_thread_fn.d_chars = d_chars; + thrust::for_each_n(rmm::exec_policy(stream), + thrust::make_counting_iterator(0), + col_size, + single_thread_fn); + + if (max_length > SINGLE_THREAD_THRESHOLD) { + str_counter.set_value(0, stream); + parse_fn_string_parallel + <<>>( + str_tuples, + col_size, + str_counter.data(), + static_cast(null_mask.data()), + null_count_data, + options, + d_offsets, + d_chars); + } + + if (max_length > WARP_THRESHOLD) { + str_counter.set_value(0, stream); + // for strings longer than WARP_THRESHOLD, 1 block per string + parse_fn_string_parallel + <<>>( + str_tuples, + col_size, + str_counter.data(), + static_cast(null_mask.data()), + null_count_data, + options, + d_offsets, + d_chars); + } + + return make_strings_column(col_size, + std::move(offsets), + std::move(chars), + d_null_count.value(stream), + std::move(null_mask)); +} + +std::unique_ptr parse_data( + const char* data, + thrust::zip_iterator> offset_length_begin, + size_type col_size, + data_type col_type, + rmm::device_buffer&& null_mask, + size_type null_count, + cudf::io::parse_options_view const& options, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + CUDF_FUNC_RANGE(); + + if (col_size == 0) { return make_empty_column(col_type); } + auto d_null_count = rmm::device_scalar(null_count, stream); + auto null_count_data = d_null_count.data(); + + // Prepare iterator that returns (string_ptr, string_length)-pairs needed by type conversion + auto str_tuples = thrust::make_transform_iterator(offset_length_begin, to_string_view_pair{data}); + + if (col_type == cudf::data_type{cudf::type_id::STRING}) { + return parse_string(str_tuples, + col_size, + std::forward(null_mask), + d_null_count, + options, + stream, + mr); + } + + auto out_col = + make_fixed_width_column(col_type, col_size, std::move(null_mask), null_count, stream, mr); + auto output_dv_ptr = mutable_column_device_view::create(*out_col, stream); + + // use `ConvertFunctor` to convert non-string values + thrust::for_each_n( + rmm::exec_policy(stream), + thrust::make_counting_iterator(0), + col_size, + [str_tuples, col = *output_dv_ptr, options, col_type, null_count_data] __device__( + size_type row) { + if (col.is_null(row)) { return; } + auto const in = str_tuples[row]; + + auto const is_null_literal = + serialized_trie_contains(options.trie_na, {in.first, static_cast(in.second)}); + + if (is_null_literal) { + col.set_null(row); + atomicAdd(null_count_data, 1); + return; + } + + // If this is a string value, remove quotes + auto [in_begin, in_end] = trim_quotes(in.first, in.first + in.second, 
options.quotechar); + + auto const is_parsed = cudf::type_dispatcher(col_type, + ConvertFunctor{}, + in_begin, + in_end, + col.data(), + row, + col_type, + options, + false); + if (not is_parsed) { + col.set_null(row); + atomicAdd(null_count_data, 1); + } + }); + + out_col->set_null_count(d_null_count.value(stream)); + + return out_col; +} + +} // namespace cudf::io::json::detail diff --git a/cpp/src/io/utilities/parsing_utils.cuh b/cpp/src/io/utilities/parsing_utils.cuh index 5c3af588411..43d62fcd513 100644 --- a/cpp/src/io/utilities/parsing_utils.cuh +++ b/cpp/src/io/utilities/parsing_utils.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2022, NVIDIA CORPORATION. + * Copyright (c) 2020-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -116,6 +116,28 @@ struct parse_options { } }; +/** + * @brief Returns the escaped characters for a given character. + * + * @param escaped_char The character to escape. + * @return The escaped characters for a given character. + */ +__device__ __forceinline__ thrust::pair get_escaped_char(char escaped_char) +{ + switch (escaped_char) { + case '"': return {'\\', '"'}; + case '\\': return {'\\', '\\'}; + case '/': return {'\\', '/'}; + case '\b': return {'\\', 'b'}; + case '\f': return {'\\', 'f'}; + case '\n': return {'\\', 'n'}; + case '\r': return {'\\', 'r'}; + case '\t': return {'\\', 't'}; + // case 'u': return UNICODE_SEQ; + default: return {'\0', escaped_char}; + } +} + /** * @brief Returns the numeric value of an ASCII/UTF-8 character. * Handles hexadecimal digits, both uppercase and lowercase diff --git a/cpp/src/io/utilities/string_parsing.hpp b/cpp/src/io/utilities/string_parsing.hpp new file mode 100644 index 00000000000..12fc0a5b2e7 --- /dev/null +++ b/cpp/src/io/utilities/string_parsing.hpp @@ -0,0 +1,79 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include + +#include +#include + +#include + +#include +#include + +namespace cudf::io { +namespace detail { + +/** + * @brief Infers data type for a given JSON string input `data`. 
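+ * Inference tallies candidate type counts per input value into a histogram and derives the
+ * resulting column type from those counts.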
+ * + * @throw cudf::logic_error if input size is 0 + * @throw cudf::logic_error if date time is not inferred as string + * @throw cudf::logic_error if data type inference failed + * + * @param options View of inference options + * @param data JSON string input + * @param offset_length_begin The beginning of an offset-length tuple sequence + * @param size Size of the string input + * @param stream CUDA stream used for device memory operations and kernel launches + * @return The inferred data type + */ +cudf::data_type infer_data_type( + cudf::io::json_inference_options_view const& options, + device_span data, + thrust::zip_iterator> offset_length_begin, + std::size_t const size, + rmm::cuda_stream_view stream); +} // namespace detail + +namespace json::detail { + +/** + * @brief Parses the data from an iterator of string views, casting it to the given target data type + * + * @param data string input base pointer + * @param offset_length_begin The beginning of an offset-length tuple sequence + * @param col_size The total number of items of this column + * @param col_type The column's target data type + * @param null_mask A null mask that renders certain items from the input invalid + * @param options Settings for controlling the processing behavior + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr The resource to be used for device memory allocation + * @return The column that contains the parsed data + */ +std::unique_ptr parse_data( + const char* data, + thrust::zip_iterator> offset_length_begin, + size_type col_size, + data_type col_type, + rmm::device_buffer&& null_mask, + size_type null_count, + cudf::io::parse_options_view const& options, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); +} // namespace json::detail +} // namespace cudf::io diff --git a/cpp/src/io/utilities/type_inference.cuh b/cpp/src/io/utilities/type_inference.cu similarity index 84% rename from cpp/src/io/utilities/type_inference.cuh rename to cpp/src/io/utilities/type_inference.cu index a9ccc80ca33..79a5c8f1c4c 100644 --- a/cpp/src/io/utilities/type_inference.cuh +++ b/cpp/src/io/utilities/type_inference.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022, NVIDIA CORPORATION. + * Copyright (c) 2022-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -13,23 +13,16 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ -#pragma once #include -#include +#include #include #include -#include #include -#include -#include #include -#include -#include - #include #include @@ -114,14 +107,14 @@ __device__ __inline__ bool is_like_float(std::size_t len, * * @param[in] options View of inference options * @param[in] data JSON string input - * @param[in] column_strings_begin The beginning of an offset-length tuple sequence + * @param[in] offset_length_begin The beginning of an offset-length tuple sequence * @param[in] size Size of the string input * @param[out] column_info Histogram of column type counters */ template __global__ void infer_column_type_kernel(OptionsView options, device_span data, - ColumnStringIter column_strings_begin, + ColumnStringIter offset_length_begin, std::size_t size, cudf::io::column_type_histogram* column_info) { @@ -129,8 +122,8 @@ __global__ void infer_column_type_kernel(OptionsView options, for (auto idx = threadIdx.x + blockDim.x * blockIdx.x; idx < size; idx += gridDim.x * blockDim.x) { - auto const field_offset = thrust::get<0>(*(column_strings_begin + idx)); - auto const field_len = thrust::get<1>(*(column_strings_begin + idx)); + auto const field_offset = thrust::get<0>(*(offset_length_begin + idx)); + auto const field_len = thrust::get<1>(*(offset_length_begin + idx)); auto const field_begin = data.begin() + field_offset; if (cudf::detail::serialized_trie_contains( @@ -234,7 +227,7 @@ __global__ void infer_column_type_kernel(OptionsView options, * * @param options View of inference options * @param data JSON string input - * @param column_strings_begin The beginning of an offset-length tuple sequence + * @param offset_length_begin The beginning of an offset-length tuple sequence * @param size Size of the string input * @param stream CUDA stream used for device memory operations and kernel launches * @return A histogram containing column-specific type counters @@ -242,7 +235,7 @@ __global__ void infer_column_type_kernel(OptionsView options, template cudf::io::column_type_histogram infer_column_type(OptionsView const& options, cudf::device_span data, - ColumnStringIter column_strings_begin, + ColumnStringIter offset_length_begin, std::size_t const size, rmm::cuda_stream_view stream) { @@ -254,40 +247,22 @@ cudf::io::column_type_histogram infer_column_type(OptionsView const& options, d_column_info.data(), 0, sizeof(cudf::io::column_type_histogram), stream.value())); infer_column_type_kernel<<>>( - options, data, column_strings_begin, size, d_column_info.data()); + options, data, offset_length_begin, size, d_column_info.data()); return d_column_info.value(stream); } -/** - * @brief Infers data type for a given JSON string input `data`. 
- * - * @throw cudf::logic_error if input size is 0 - * @throw cudf::logic_error if date time is not inferred as string - * @throw cudf::logic_error if data type inference failed - * - * @tparam OptionsView Type of inference options view - * @tparam ColumnStringIter Iterator type whose `value_type` is convertible to - * `thrust::tuple` - * - * @param options View of inference options - * @param data JSON string input - * @param column_strings_begin The beginning of an offset-length tuple sequence - * @param size Size of the string input - * @param stream CUDA stream used for device memory operations and kernel launches - * @return The inferred data type - */ -template -cudf::data_type infer_data_type(OptionsView const& options, - device_span data, - ColumnStringIter column_strings_begin, - std::size_t const size, - rmm::cuda_stream_view stream) +cudf::data_type infer_data_type( + cudf::io::json_inference_options_view const& options, + device_span data, + thrust::zip_iterator> offset_length_begin, + std::size_t const size, + rmm::cuda_stream_view stream) { CUDF_FUNC_RANGE(); CUDF_EXPECTS(size != 0, "No data available for data type inference.\n"); - auto const h_column_info = infer_column_type(options, data, column_strings_begin, size, stream); + auto const h_column_info = infer_column_type(options, data, offset_length_begin, size, stream); auto get_type_id = [&](auto const& cinfo) { auto int_count_total = diff --git a/cpp/tests/io/json_test.cpp b/cpp/tests/io/json_test.cpp index 220f1a3391f..7c911ac2e04 100644 --- a/cpp/tests/io/json_test.cpp +++ b/cpp/tests/io/json_test.cpp @@ -26,6 +26,7 @@ #include #include #include +#include #include #include #include @@ -1370,6 +1371,124 @@ TEST_F(JsonReaderTest, JsonExperimentalLines) CUDF_TEST_EXPECT_TABLES_EQUAL(legacy_reader_table.tbl->view(), table.tbl->view()); } +TEST_F(JsonReaderTest, JsonLongString) +{ + // Unicode + // 0000-FFFF Basic Multilingual Plane + // 10000-10FFFF Supplementary Plane + cudf::test::strings_column_wrapper col1{ + { + "\"\\/\b\f\n\r\t", + "\"", + "\\", + "/", + "\b", + "\f\n", + "\r\t", + "$€", + "ராபிட்ஸ்", + "C𝞵𝓓𝒻", + "", // null + "", // null + "கார்த்தி", + "CႮ≪ㇳ䍏凹沦王辿龸ꁗ믜스폶ﴠ", // 0000-FFFF + "𐀀𑿪𒐦𓃰𔙆 𖦆𗿿𘳕𚿾[↳] 𜽆𝓚𞤁🄰", // 10000-1FFFF + "𠘨𡥌𢗉𣇊𤊩𥅽𦉱𧴱𨁲𩁹𪐢𫇭𬬭𭺷𮊦屮", // 20000-2FFFF + "𰾑𱔈𲍉", // 30000-3FFFF + R"("$€ \u0024\u20ac \\u0024\\u20ac \\\u0024\\\u20ac \\\\u0024\\\\u20ac)", + R"( \\\\\\\\\\\\\\\\)", + R"(\\\\\\\\\\\\\\\\)", + R"(\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\)", + R"( \\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\)", + R"( \\abcd)", + R"( \\\\\\\\\\\\\\\\ \\\\\\\\\\\\\\\\)", + R"( \\\\\\\\\\\\\\\\ \\\\\\\\\\\\\\\\)", + }, + cudf::test::iterators::nulls_at({10, 11})}; + + cudf::test::fixed_width_column_wrapper repeat_times{ + {1, 2, 3, 4, 5, 6, 7, 8, 9, 13, 19, 37, 81, 161, 323, 631, 1279, 10, 1, 2, 1, 100, 1000, 1, 3}, + cudf::test::iterators::no_nulls()}; + auto d_col2 = cudf::strings::repeat_strings(cudf::strings_column_view{col1}, repeat_times); + auto col2 = d_col2->view(); + cudf::table_view const tbl_view{{col1, col2, repeat_times}}; + cudf::io::table_metadata mt{{{"col1"}, {"col2"}, {"int16"}}}; + + std::vector out_buffer; + auto destination = cudf::io::sink_info(&out_buffer); + auto options_builder = cudf::io::json_writer_options_builder(destination, tbl_view) + .include_nulls(true) + .metadata(mt) + .lines(true) + .na_rep("null"); + + cudf::io::write_json(options_builder.build(), rmm::mr::get_current_device_resource()); + + cudf::table_view const expected = tbl_view; + std::map types; + types["col1"] = data_type{type_id::STRING}; + 
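+ // col2 repeats each col1 string according to repeat_times, so it must also read back as STRING.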
types["col2"] = data_type{type_id::STRING}; + types["int16"] = data_type{type_id::INT16}; + + // Initialize parsing options (reading json lines) + cudf::io::json_reader_options json_lines_options = + cudf::io::json_reader_options::builder( + cudf::io::source_info{out_buffer.data(), out_buffer.size()}) + .lines(true) + .dtypes(types); + + // Read test data via nested JSON reader + auto const table = cudf::io::read_json(json_lines_options); + CUDF_TEST_EXPECT_TABLES_EQUAL(expected, table.tbl->view()); +} + +TEST_F(JsonReaderTest, ErrorStrings) +{ + // cases of invalid escape characters, invalid unicode encodings. + // Error strings will decode to nulls + auto const buffer = std::string{R"( + {"col0": "\"\a"} + {"col0": "\u"} + {"col0": "\u0"} + {"col0": "\u0b"} + {"col0": "\u00b"} + {"col0": "\u00bz"} + {"col0": "\t34567890123456\t9012345678901\ug0bc"} + {"col0": "\t34567890123456\t90123456789012\u0hbc"} + {"col0": "\t34567890123456\t90123456789012\u00ic"} + {"col0": "\u0b95\u0bbe\u0bb0\u0bcd\u0ba4\u0bcd\u0ba4\u0bbfகார்த்தி"} +)"}; + // Last one is not an error case, but shows that unicode in json is copied string column output. + + cudf::io::json_reader_options const in_opts = + cudf::io::json_reader_options::builder(cudf::io::source_info{buffer.c_str(), buffer.size()}) + .dtypes({data_type{cudf::type_id::STRING}}) + .lines(true) + .legacy(false); + + auto const result = cudf::io::read_json(in_opts); + auto const result_view = result.tbl->view().column(0); + + EXPECT_EQ(result.metadata.schema_info[0].name, "col0"); + EXPECT_EQ(result_view.null_count(), 9); + cudf::test::strings_column_wrapper expected{ + {"", + "", + "", + "", + "", + "", + "", + "", + "", + "கார்த்தி\xe0\xae\x95\xe0\xae\xbe\xe0\xae\xb0\xe0\xaf\x8d\xe0\xae\xa4\xe0\xaf\x8d\xe0\xae\xa4" + "\xe0\xae\xbf"}, + // unicode hex 0xe0 0xae 0x95 0xe0 0xae 0xbe 0xe0 0xae 0xb0 0xe0 0xaf 0x8d + // 0xe0 0xae 0xa4 0xe0 0xaf 0x8d 0xe0 0xae 0xa4 0xe0 0xae 0xbf + cudf::test::iterators::nulls_at({0, 1, 2, 3, 4, 5, 6, 7, 8})}; + CUDF_TEST_EXPECT_COLUMNS_EQUAL(result_view, expected); +} + TEST_F(JsonReaderTest, TokenAllocation) { std::array const json_inputs{ diff --git a/cpp/tests/io/json_type_cast_test.cu b/cpp/tests/io/json_type_cast_test.cu index 5c32131114d..9eb5e8f5230 100644 --- a/cpp/tests/io/json_type_cast_test.cu +++ b/cpp/tests/io/json_type_cast_test.cu @@ -21,15 +21,20 @@ #include #include +#include + #include #include #include -#include #include #include #include #include +#include + +#include +#include #include using namespace cudf::test::iterators; @@ -37,13 +42,27 @@ using namespace cudf::test::iterators; struct JSONTypeCastTest : public cudf::test::BaseFixture {}; namespace { -struct to_thrust_pair_fn { - __device__ thrust::pair operator()( - thrust::pair const& p) +struct offsets_to_length { + __device__ cudf::size_type operator()(thrust::tuple const& p) { - return {p.first.data(), p.first.size_bytes()}; + return thrust::get<1>(p) - thrust::get<0>(p); } }; + +/// Returns length of each string in the column +auto string_offset_to_length(cudf::strings_column_view const& column, rmm::cuda_stream_view stream) +{ + auto offsets_begin = column.offsets_begin(); + auto offsets_pair = + thrust::make_zip_iterator(thrust::make_tuple(offsets_begin, thrust::next(offsets_begin))); + rmm::device_uvector svs_length(column.size(), stream); + thrust::transform(rmm::exec_policy(cudf::get_default_stream()), + offsets_pair, + offsets_pair + column.size(), + svs_length.begin(), + offsets_to_length{}); + return svs_length; +} } // namespace auto 
default_json_options() @@ -67,26 +86,23 @@ TEST_F(JSONTypeCastTest, String) std::vector input_values{"this", "is", "null", "of", "", "strings", R"("null")"}; cudf::test::strings_column_wrapper input(input_values.begin(), input_values.end(), in_valids); - auto d_column = cudf::column_device_view::create(input); - rmm::device_uvector> svs(d_column->size(), stream); - thrust::transform(rmm::exec_policy(cudf::get_default_stream()), - d_column->pair_begin(), - d_column->pair_end(), - svs.begin(), - to_thrust_pair_fn{}); + auto column = cudf::strings_column_view(input); + rmm::device_uvector svs_length = string_offset_to_length(column, stream); auto null_mask_it = no_nulls(); auto null_mask = - std::get<0>(cudf::test::detail::make_null_mask(null_mask_it, null_mask_it + d_column->size())); - - auto str_col = cudf::io::json::detail::parse_data(svs.data(), - svs.size(), - type, - std::move(null_mask), - 0, - default_json_options().view(), - stream, - mr); + std::get<0>(cudf::test::detail::make_null_mask(null_mask_it, null_mask_it + column.size())); + + auto str_col = cudf::io::json::detail::parse_data( + column.chars().data(), + thrust::make_zip_iterator(thrust::make_tuple(column.offsets_begin(), svs_length.begin())), + column.size(), + type, + std::move(null_mask), + 0, + default_json_options().view(), + stream, + mr); auto out_valids = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i != 2 and i != 4; }); @@ -103,26 +119,23 @@ TEST_F(JSONTypeCastTest, Int) auto const type = cudf::data_type{cudf::type_id::INT64}; cudf::test::strings_column_wrapper data({"1", "null", "3", "true", "5", "false"}); - auto d_column = cudf::column_device_view::create(data); - rmm::device_uvector> svs(d_column->size(), stream); - thrust::transform(rmm::exec_policy(cudf::get_default_stream()), - d_column->pair_begin(), - d_column->pair_end(), - svs.begin(), - to_thrust_pair_fn{}); + auto column = cudf::strings_column_view(data); + rmm::device_uvector svs_length = string_offset_to_length(column, stream); auto null_mask_it = no_nulls(); auto null_mask = - std::get<0>(cudf::test::detail::make_null_mask(null_mask_it, null_mask_it + d_column->size())); - - auto col = cudf::io::json::detail::parse_data(svs.data(), - svs.size(), - type, - std::move(null_mask), - 0, - default_json_options().view(), - stream, - mr); + std::get<0>(cudf::test::detail::make_null_mask(null_mask_it, null_mask_it + column.size())); + + auto col = cudf::io::json::detail::parse_data( + column.chars().data(), + thrust::make_zip_iterator(thrust::make_tuple(column.offsets_begin(), svs_length.begin())), + column.size(), + type, + std::move(null_mask), + 0, + default_json_options().view(), + stream, + mr); auto expected = cudf::test::fixed_width_column_wrapper{{1, 2, 3, 1, 5, 0}, {1, 0, 1, 1, 1, 1}}; @@ -146,26 +159,23 @@ TEST_F(JSONTypeCastTest, StringEscapes) R"("escape with nothing to escape \")", R"("\"\\\/\b\f\n\r\t")", }); - auto d_column = cudf::column_device_view::create(data); - rmm::device_uvector> svs(d_column->size(), stream); - thrust::transform(rmm::exec_policy(cudf::get_default_stream()), - d_column->pair_begin(), - d_column->pair_end(), - svs.begin(), - to_thrust_pair_fn{}); + auto column = cudf::strings_column_view(data); + rmm::device_uvector svs_length = string_offset_to_length(column, stream); auto null_mask_it = no_nulls(); auto null_mask = - std::get<0>(cudf::test::detail::make_null_mask(null_mask_it, null_mask_it + d_column->size())); - - auto col = cudf::io::json::detail::parse_data(svs.data(), - svs.size(), - 
type, - std::move(null_mask), - 0, - default_json_options().view(), - stream, - mr); + std::get<0>(cudf::test::detail::make_null_mask(null_mask_it, null_mask_it + column.size())); + + auto col = cudf::io::json::detail::parse_data( + column.chars().data(), + thrust::make_zip_iterator(thrust::make_tuple(column.offsets_begin(), svs_length.begin())), + column.size(), + type, + std::move(null_mask), + 0, + default_json_options().view(), + stream, + mr); auto expected = cudf::test::strings_column_wrapper{ {"🚀", "A🚀AA", "", "", "", "\\", "➩", "", "\"\\/\b\f\n\r\t"}, @@ -173,4 +183,71 @@ TEST_F(JSONTypeCastTest, StringEscapes) CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(col->view(), expected); } +TEST_F(JSONTypeCastTest, ErrorNulls) +{ + auto const stream = cudf::get_default_stream(); + auto mr = rmm::mr::get_current_device_resource(); + auto const type = cudf::data_type{cudf::type_id::STRING}; + + // error in decoding + std::vector input_values{R"("\"\a")", + R"("\u")", + R"("\u0")", + R"("\u0b")", + R"("\u00b")", + R"("\u00bz")", + R"("\t34567890123456\t9012345678901\ug0bc")", + R"("\t34567890123456\t90123456789012\u0hbc")", + R"("\t34567890123456\t90123456789012\u00ic")", + R"("\t34567890123456\t9012345678901\")", + R"("\t34567890123456\t90123456789012\")", + R"(null)"}; + // Note: without quotes are copied without decoding + cudf::test::strings_column_wrapper input(input_values.begin(), input_values.end()); + + auto column = cudf::strings_column_view(input); + auto space_length = 128; + auto prepend_space = [&space_length](auto const& s) { + if (s[0] == '"') return "\"" + std::string(space_length, ' ') + std::string(s + 1); + return std::string(s); + }; + std::vector small_input; + std::transform( + input_values.begin(), input_values.end(), std::back_inserter(small_input), prepend_space); + cudf::test::strings_column_wrapper small_col(small_input.begin(), small_input.end()); + + std::vector large_input; + space_length = 128 * 128; + std::transform( + input_values.begin(), input_values.end(), std::back_inserter(large_input), prepend_space); + cudf::test::strings_column_wrapper large_col(large_input.begin(), large_input.end()); + + std::vector expected_values{"", "", "", "", "", "", "", "", "", "", "", ""}; + cudf::test::strings_column_wrapper expected( + expected_values.begin(), expected_values.end(), cudf::test::iterators::all_nulls()); + + // single threads, warp, block. + for (auto const& column : + {column, cudf::strings_column_view(small_col), cudf::strings_column_view(large_col)}) { + rmm::device_uvector svs_length = string_offset_to_length(column, stream); + + auto null_mask_it = no_nulls(); + auto null_mask = + std::get<0>(cudf::test::detail::make_null_mask(null_mask_it, null_mask_it + column.size())); + + auto str_col = cudf::io::json::detail::parse_data( + column.chars().data(), + thrust::make_zip_iterator(thrust::make_tuple(column.offsets_begin(), svs_length.begin())), + column.size(), + type, + std::move(null_mask), + 0, + default_json_options().view(), + stream, + mr); + + CUDF_TEST_EXPECT_COLUMNS_EQUAL(str_col->view(), expected); + } +} + CUDF_TEST_PROGRAM_MAIN() diff --git a/cpp/tests/io/type_inference_test.cu b/cpp/tests/io/type_inference_test.cu index b2eb1b94f9c..a14e7ecf5b3 100644 --- a/cpp/tests/io/type_inference_test.cu +++ b/cpp/tests/io/type_inference_test.cu @@ -14,8 +14,8 @@ * limitations under the License. 
*/ +#include #include -#include #include #include @@ -50,8 +50,8 @@ TEST_F(TypeInference, Basic) auto d_data = cudf::make_string_scalar(data); auto& d_string_scalar = static_cast(*d_data); - auto const string_offset = std::vector{1, 4, 7}; - auto const string_length = std::vector{2, 2, 1}; + auto const string_offset = std::vector{1, 4, 7}; + auto const string_length = std::vector{2, 2, 1}; auto const d_string_offset = cudf::detail::make_device_uvector_async( string_offset, cudf::get_default_stream(), rmm::mr::get_current_device_resource()); auto const d_string_length = cudf::detail::make_device_uvector_async( @@ -83,8 +83,8 @@ TEST_F(TypeInference, Null) auto d_data = cudf::make_string_scalar(data); auto& d_string_scalar = static_cast(*d_data); - auto const string_offset = std::vector{1, 1, 4}; - auto const string_length = std::vector{0, 2, 1}; + auto const string_offset = std::vector{1, 1, 4}; + auto const string_length = std::vector{0, 2, 1}; auto const d_string_offset = cudf::detail::make_device_uvector_async( string_offset, cudf::get_default_stream(), rmm::mr::get_current_device_resource()); auto const d_string_length = cudf::detail::make_device_uvector_async( @@ -116,8 +116,8 @@ TEST_F(TypeInference, AllNull) auto d_data = cudf::make_string_scalar(data); auto& d_string_scalar = static_cast(*d_data); - auto const string_offset = std::vector{1, 1, 1}; - auto const string_length = std::vector{0, 0, 4}; + auto const string_offset = std::vector{1, 1, 1}; + auto const string_length = std::vector{0, 0, 4}; auto const d_string_offset = cudf::detail::make_device_uvector_async( string_offset, cudf::get_default_stream(), rmm::mr::get_current_device_resource()); auto const d_string_length = cudf::detail::make_device_uvector_async( @@ -149,8 +149,8 @@ TEST_F(TypeInference, String) auto d_data = cudf::make_string_scalar(data); auto& d_string_scalar = static_cast(*d_data); - auto const string_offset = std::vector{1, 8, 12}; - auto const string_length = std::vector{6, 3, 4}; + auto const string_offset = std::vector{1, 8, 12}; + auto const string_length = std::vector{6, 3, 4}; auto const d_string_offset = cudf::detail::make_device_uvector_async( string_offset, cudf::get_default_stream(), rmm::mr::get_current_device_resource()); auto const d_string_length = cudf::detail::make_device_uvector_async( @@ -182,8 +182,8 @@ TEST_F(TypeInference, Bool) auto d_data = cudf::make_string_scalar(data); auto& d_string_scalar = static_cast(*d_data); - auto const string_offset = std::vector{1, 6, 12}; - auto const string_length = std::vector{4, 5, 5}; + auto const string_offset = std::vector{1, 6, 12}; + auto const string_length = std::vector{4, 5, 5}; auto const d_string_offset = cudf::detail::make_device_uvector_async( string_offset, cudf::get_default_stream(), rmm::mr::get_current_device_resource()); auto const d_string_length = cudf::detail::make_device_uvector_async( @@ -215,8 +215,8 @@ TEST_F(TypeInference, Timestamp) auto d_data = cudf::make_string_scalar(data); auto& d_string_scalar = static_cast(*d_data); - auto const string_offset = std::vector{1, 10}; - auto const string_length = std::vector{8, 9}; + auto const string_offset = std::vector{1, 10}; + auto const string_length = std::vector{8, 9}; auto const d_string_offset = cudf::detail::make_device_uvector_async( string_offset, cudf::get_default_stream(), rmm::mr::get_current_device_resource()); auto const d_string_length = cudf::detail::make_device_uvector_async( @@ -249,8 +249,8 @@ TEST_F(TypeInference, InvalidInput) auto d_data = 
cudf::make_string_scalar(data); auto& d_string_scalar = static_cast(*d_data); - auto const string_offset = std::vector{1, 3, 5, 7, 9}; - auto const string_length = std::vector{1, 1, 1, 1, 1}; + auto const string_offset = std::vector{1, 3, 5, 7, 9}; + auto const string_length = std::vector{1, 1, 1, 1, 1}; auto const d_string_offset = cudf::detail::make_device_uvector_async( string_offset, cudf::get_default_stream(), rmm::mr::get_current_device_resource()); auto const d_string_length = cudf::detail::make_device_uvector_async( From 63d197fe029ff2b57f4e0c7ab975bb35f844fc25 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Tue, 19 Sep 2023 19:27:10 -0700 Subject: [PATCH 186/230] Avoid circular cimports in _lib/cpp/reduce.pxd (#14125) This Cython modules contains some cimports from higher-level modules than it should, which introduces the possibility for circular import issues. Also it contains an unused import of DeviceScalar that can cause similar issues. Authors: - Vyas Ramasubramani (https://github.com/vyasr) Approvers: - Lawrence Mitchell (https://github.com/wence-) URL: https://github.com/rapidsai/cudf/pull/14125 --- python/cudf/cudf/_lib/cpp/reduce.pxd | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/python/cudf/cudf/_lib/cpp/reduce.pxd b/python/cudf/cudf/_lib/cpp/reduce.pxd index 7952c717916..997782dec6c 100644 --- a/python/cudf/cudf/_lib/cpp/reduce.pxd +++ b/python/cudf/cudf/_lib/cpp/reduce.pxd @@ -1,14 +1,13 @@ -# Copyright (c) 2020-2022, NVIDIA CORPORATION. +# Copyright (c) 2020-2023, NVIDIA CORPORATION. from libcpp.memory cimport unique_ptr from libcpp.utility cimport pair -from cudf._lib.aggregation cimport reduce_aggregation, scan_aggregation +from cudf._lib.cpp.aggregation cimport reduce_aggregation, scan_aggregation from cudf._lib.cpp.column.column cimport column from cudf._lib.cpp.column.column_view cimport column_view from cudf._lib.cpp.scalar.scalar cimport scalar from cudf._lib.cpp.types cimport data_type -from cudf._lib.scalar cimport DeviceScalar cdef extern from "cudf/reduction.hpp" namespace "cudf" nogil: From 2d4f22a9ab0709f808af9253097037e0eb5d00b1 Mon Sep 17 00:00:00 2001 From: Sam Turner <98767222+stmio@users.noreply.github.com> Date: Wed, 20 Sep 2023 13:57:26 +0100 Subject: [PATCH 187/230] Implement `GroupBy.value_counts` to match pandas API (#14114) This PR implements `GroupBy.value_counts`, matching the [pandas equivalent](https://pandas.pydata.org/docs/dev/reference/api/pandas.core.groupby.DataFrameGroupBy.value_counts.html) method. Tests currently ignore the returned Series/DataFrame's name, as this was [added to pandas in v2.0.0](https://github.com/pandas-dev/pandas/commit/bec92a43feb0057f06f4f9b9db26c1a09232b1c0). This can be removed if tests are against `pandas>=2.0.0`. 
Closes #12789 Authors: - Sam Turner (https://github.com/stmio) Approvers: - Bradley Dice (https://github.com/bdice) - GALI PREM SAGAR (https://github.com/galipremsagar) URL: https://github.com/rapidsai/cudf/pull/14114 --- python/cudf/cudf/core/groupby/groupby.py | 164 +++++++++++++++++++++++ python/cudf/cudf/tests/test_groupby.py | 67 +++++++++ 2 files changed, 231 insertions(+) diff --git a/python/cudf/cudf/core/groupby/groupby.py b/python/cudf/cudf/core/groupby/groupby.py index b300c55b537..e1740140b44 100644 --- a/python/cudf/cudf/core/groupby/groupby.py +++ b/python/cudf/cudf/core/groupby/groupby.py @@ -2336,6 +2336,170 @@ def pct_change( shifted = fill_grp.shift(periods=periods, freq=freq) return (filled / shifted) - 1 + def value_counts( + self, + subset=None, + normalize: bool = False, + sort: bool = True, + ascending: bool = False, + dropna: bool = True, + ) -> DataFrameOrSeries: + """ + Return a Series or DataFrame containing counts of unique rows. + + Parameters + ---------- + subset : list-like, optional + Columns to use when counting unique combinations. + normalize : bool, default False + Return proportions rather than frequencies. + sort : bool, default True + Sort by frequencies. + ascending : bool, default False + Sort in ascending order. + dropna : bool, default True + Don't include counts of rows that contain NA values. + + Returns + ------- + Series or DataFrame + Series if the groupby as_index is True, otherwise DataFrame. + + See Also + -------- + Series.value_counts: Equivalent method on Series. + DataFrame.value_counts: Equivalent method on DataFrame. + SeriesGroupBy.value_counts: Equivalent method on SeriesGroupBy. + + Notes + ----- + - If the groupby as_index is True then the returned Series will have a + MultiIndex with one level per input column. + - If the groupby as_index is False then the returned DataFrame will + have an additional column with the value_counts. The column is + labelled 'count' or 'proportion', depending on the ``normalize`` + parameter. + + By default, rows that contain any NA values are omitted from + the result. + + By default, the result will be in descending order so that the + first element of each group is the most frequently-occurring row. + + Examples + -------- + >>> import cudf + >>> df = cudf.DataFrame({ + ... 'gender': ['male', 'male', 'female', 'male', 'female', 'male'], + ... 'education': ['low', 'medium', 'high', 'low', 'high', 'low'], + ... 'country': ['US', 'FR', 'US', 'FR', 'FR', 'FR'] + ... 
}) + + >>> df + gender education country + 0 male low US + 1 male medium FR + 2 female high US + 3 male low FR + 4 female high FR + 5 male low FR + + >>> df.groupby('gender').value_counts() + gender education country + female high FR 1 + US 1 + male low FR 2 + US 1 + medium FR 1 + Name: count, dtype: int64 + + >>> df.groupby('gender').value_counts(ascending=True) + gender education country + female high FR 1 + US 1 + male low US 1 + medium FR 1 + low FR 2 + Name: count, dtype: int64 + + >>> df.groupby('gender').value_counts(normalize=True) + gender education country + female high FR 0.50 + US 0.50 + male low FR 0.50 + US 0.25 + medium FR 0.25 + Name: proportion, dtype: float64 + + >>> df.groupby('gender', as_index=False).value_counts() + gender education country count + 0 female high FR 1 + 1 female high US 1 + 2 male low FR 2 + 3 male low US 1 + 4 male medium FR 1 + + >>> df.groupby('gender', as_index=False).value_counts(normalize=True) + gender education country proportion + 0 female high FR 0.50 + 1 female high US 0.50 + 2 male low FR 0.50 + 3 male low US 0.25 + 4 male medium FR 0.25 + """ + + df = cudf.DataFrame.copy(self.obj) + groupings = self.grouping.names + name = "proportion" if normalize else "count" + + if subset is None: + subset = [i for i in df._column_names if i not in groupings] + # Check subset exists in dataframe + elif set(subset) - set(df._column_names): + raise ValueError( + f"Keys {set(subset) - set(df._column_names)} in subset " + f"do not exist in the DataFrame." + ) + # Catch case where groupby and subset share an element + elif set(subset) & set(groupings): + raise ValueError( + f"Keys {set(subset) & set(groupings)} in subset " + "cannot be in the groupby column keys." + ) + + df["__placeholder"] = 1 + result = ( + df.groupby(groupings + list(subset), dropna=dropna)[ + "__placeholder" + ] + .count() + .sort_index() + .astype(np.int64) + ) + + if normalize: + levels = list(range(len(groupings), result.index.nlevels)) + result /= result.groupby( + result.index.droplevel(levels), + ).transform("sum") + + if sort: + result = result.sort_values(ascending=ascending).sort_index( + level=range(len(groupings)), sort_remaining=False + ) + + if not self._as_index: + if name in df._column_names: + raise ValueError( + f"Column label '{name}' is duplicate of result column" + ) + result.name = name + result = result.to_frame().reset_index() + else: + result.name = name + + return result + def _mimic_pandas_order( self, result: DataFrameOrSeries ) -> DataFrameOrSeries: diff --git a/python/cudf/cudf/tests/test_groupby.py b/python/cudf/cudf/tests/test_groupby.py index 042f0e1aa38..376639d5226 100644 --- a/python/cudf/cudf/tests/test_groupby.py +++ b/python/cudf/cudf/tests/test_groupby.py @@ -3473,3 +3473,70 @@ def test_categorical_grouping_pandas_compatibility(): expected = pdf.groupby("key", sort=False).sum() assert_eq(actual, expected) + + +@pytest.mark.parametrize("normalize", [True, False]) +@pytest.mark.parametrize("sort", [True, False]) +@pytest.mark.parametrize("ascending", [True, False]) +@pytest.mark.parametrize("dropna", [True, False]) +@pytest.mark.parametrize("as_index", [True, False]) +def test_group_by_value_counts(normalize, sort, ascending, dropna, as_index): + # From Issue#12789 + df = cudf.DataFrame( + { + "gender": ["male", "male", "female", "male", "female", "male"], + "education": ["low", "medium", np.nan, "low", "high", "low"], + "country": ["US", "FR", "US", "FR", "FR", "FR"], + } + ) + pdf = df.to_pandas() + + actual = df.groupby("gender", 
as_index=as_index).value_counts( + normalize=normalize, sort=sort, ascending=ascending, dropna=dropna + ) + expected = pdf.groupby("gender", as_index=as_index).value_counts( + normalize=normalize, sort=sort, ascending=ascending, dropna=dropna + ) + + # TODO: Remove `check_names=False` once testing against `pandas>=2.0.0` + assert_groupby_results_equal( + actual, expected, check_names=False, check_index_type=False + ) + + +def test_group_by_value_counts_subset(): + # From Issue#12789 + df = cudf.DataFrame( + { + "gender": ["male", "male", "female", "male", "female", "male"], + "education": ["low", "medium", "high", "low", "high", "low"], + "country": ["US", "FR", "US", "FR", "FR", "FR"], + } + ) + pdf = df.to_pandas() + + actual = df.groupby("gender").value_counts(["education"]) + expected = pdf.groupby("gender").value_counts(["education"]) + + # TODO: Remove `check_names=False` once testing against `pandas>=2.0.0` + assert_groupby_results_equal( + actual, expected, check_names=False, check_index_type=False + ) + + +def test_group_by_value_counts_clash_with_subset(): + df = cudf.DataFrame({"a": [1, 5, 3], "b": [2, 5, 2]}) + with pytest.raises(ValueError): + df.groupby("a").value_counts(["a"]) + + +def test_group_by_value_counts_subset_not_exists(): + df = cudf.DataFrame({"a": [1, 5, 3], "b": [2, 5, 2]}) + with pytest.raises(ValueError): + df.groupby("a").value_counts(["c"]) + + +def test_group_by_value_counts_with_count_column(): + df = cudf.DataFrame({"a": [1, 5, 3], "count": [2, 5, 2]}) + with pytest.raises(ValueError): + df.groupby("a", as_index=False).value_counts() From 7b0693f6a5fd58e247a7669a813c6ffba850e4e0 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Wed, 20 Sep 2023 04:46:35 -1000 Subject: [PATCH 188/230] Fix DataFrame.values with no columns but index (#14134) Fixes the following ```python In [32]: cudf.DataFrame(index=range(10)).values Out[32]: array([], shape=(0, 0), dtype=float64) ``` Authors: - Matthew Roeschke (https://github.com/mroeschke) Approvers: - GALI PREM SAGAR (https://github.com/galipremsagar) URL: https://github.com/rapidsai/cudf/pull/14134 --- python/cudf/cudf/core/frame.py | 2 +- python/cudf/cudf/tests/test_dataframe.py | 6 ++++++ 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index 6224793d6f1..1e6d177f8ca 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -437,7 +437,7 @@ def get_column_values_na(col): ncol = self._num_columns if ncol == 0: return make_empty_matrix( - shape=(0, 0), dtype=np.dtype("float64"), order="F" + shape=(len(self), ncol), dtype=np.dtype("float64"), order="F" ) if dtype is None: diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py index cbef9bfa2d8..b69f22ade81 100644 --- a/python/cudf/cudf/tests/test_dataframe.py +++ b/python/cudf/cudf/tests/test_dataframe.py @@ -10374,3 +10374,9 @@ def test_dataframe_init_from_nested_dict(): pdf = pd.DataFrame(regular_dict) gdf = cudf.DataFrame(regular_dict) assert_eq(pdf, gdf) + + +def test_data_frame_values_no_cols_but_index(): + result = cudf.DataFrame(index=range(5)).values + expected = pd.DataFrame(index=range(5)).values + assert_eq(result, expected) From f7ca051145d41cf323cfb5a066068cb8b75d3fb3 Mon Sep 17 00:00:00 2001 From: GALI PREM SAGAR Date: Wed, 20 Sep 2023 10:49:06 -0500 Subject: [PATCH 189/230] Fix type of empty `Index` and raise warning in `Series` constructor (#14116) Fixes: #14091 
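A short repro-style sketch of the behavior this change targets (illustrative only; the dtype and warning behavior follow the tests added below, and exact reprs may differ):

```python
import cudf

# An empty Index now defaults to a string ("object") dtype instead of
# float64, matching pandas.
idx = cudf.Index([])

# Constructing an empty Series without an explicit dtype now emits a
# FutureWarning (as pandas does); passing a dtype keeps it silent.
sr = cudf.Series([], dtype="float64")
```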
This PR fixes empty inputs dtype in `Index` to default to `str` instead of `float64`. Another change is there is a deprecation warning for `Series` constructor to match pandas. Authors: - GALI PREM SAGAR (https://github.com/galipremsagar) Approvers: - Bradley Dice (https://github.com/bdice) URL: https://github.com/rapidsai/cudf/pull/14116 --- python/cudf/cudf/core/algorithms.py | 21 +++++++---- python/cudf/cudf/core/dataframe.py | 2 +- python/cudf/cudf/core/index.py | 12 ++++++- python/cudf/cudf/core/series.py | 32 +++++++++++++++-- python/cudf/cudf/testing/_utils.py | 21 +++++++++-- python/cudf/cudf/tests/test_dataframe.py | 19 +++++----- python/cudf/cudf/tests/test_dropna.py | 9 +++-- python/cudf/cudf/tests/test_duplicates.py | 4 +-- python/cudf/cudf/tests/test_index.py | 16 ++++++--- python/cudf/cudf/tests/test_rolling.py | 9 +++-- python/cudf/cudf/tests/test_series.py | 43 ++++++++++++++--------- python/cudf/cudf/tests/test_stats.py | 23 ++++++------ 12 files changed, 148 insertions(+), 63 deletions(-) diff --git a/python/cudf/cudf/core/algorithms.py b/python/cudf/cudf/core/algorithms.py index a472142ece0..25d58029d6b 100644 --- a/python/cudf/cudf/core/algorithms.py +++ b/python/cudf/cudf/core/algorithms.py @@ -4,12 +4,13 @@ import cupy as cp import numpy as np +from cudf.core.column import as_column from cudf.core.copy_types import BooleanMask -from cudf.core.index import Index, RangeIndex +from cudf.core.index import RangeIndex, as_index from cudf.core.indexed_frame import IndexedFrame from cudf.core.scalar import Scalar -from cudf.core.series import Series from cudf.options import get_option +from cudf.utils.dtypes import can_convert_to_column def factorize( @@ -95,7 +96,13 @@ def factorize( return_cupy_array = isinstance(values, cp.ndarray) - values = Series(values) + if not can_convert_to_column(values): + raise TypeError( + "'values' can only be a Series, Index, or CuPy array, " + f"got {type(values)}" + ) + + values = as_column(values) if na_sentinel is None: na_sentinel = ( @@ -128,22 +135,22 @@ def factorize( warnings.warn("size_hint is not applicable for cudf.factorize") if use_na_sentinel is None or use_na_sentinel: - cats = values._column.dropna() + cats = values.dropna() else: - cats = values._column + cats = values cats = cats.unique().astype(values.dtype) if sort: cats = cats.sort_values() - labels = values._column._label_encoding( + labels = values._label_encoding( cats=cats, na_sentinel=Scalar(na_sentinel), dtype="int64" if get_option("mode.pandas_compatible") else None, ).values - return labels, cats.values if return_cupy_array else Index(cats) + return labels, cats.values if return_cupy_array else as_index(cats) def _linear_interpolation(column, index=None): diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index 84c16b71997..6e664468644 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -5607,7 +5607,7 @@ def quantile( result.name = q return result - result.index = list(map(float, qs)) + result.index = cudf.Index(list(map(float, qs)), dtype="float64") return result @_cudf_nvtx_annotate diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py index 56ec9ce0359..de8a5948033 100644 --- a/python/cudf/cudf/core/index.py +++ b/python/cudf/cudf/core/index.py @@ -13,6 +13,7 @@ List, MutableMapping, Optional, + Sequence, Tuple, Type, Union, @@ -3467,7 +3468,7 @@ def __new__( "tupleize_cols != True is not yet supported" ) - return as_index( + res = as_index( data, copy=copy, 
dtype=dtype, @@ -3475,6 +3476,15 @@ def __new__( nan_as_null=nan_as_null, **kwargs, ) + if ( + isinstance(data, Sequence) + and not isinstance(data, range) + and len(data) == 0 + and dtype is None + and getattr(data, "dtype", None) is None + ): + return res.astype("str") + return res @classmethod @_cudf_nvtx_annotate diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index 7692d3015f8..a195738af54 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -9,7 +9,16 @@ import warnings from collections import abc from shutil import get_terminal_size -from typing import Any, Dict, MutableMapping, Optional, Set, Tuple, Union +from typing import ( + Any, + Dict, + MutableMapping, + Optional, + Sequence, + Set, + Tuple, + Union, +) import cupy import numpy as np @@ -500,6 +509,18 @@ def __init__( copy=False, nan_as_null=True, ): + if ( + isinstance(data, Sequence) + and len(data) == 0 + and dtype is None + and getattr(data, "dtype", None) is None + ): + warnings.warn( + "The default dtype for empty Series will be 'object' instead " + "of 'float64' in a future version. Specify a dtype explicitly " + "to silence this warning.", + FutureWarning, + ) if isinstance(data, pd.Series): if name is None: name = data.name @@ -656,7 +677,10 @@ def from_pandas(cls, s, nan_as_null=None): 3 NaN dtype: float64 """ - return cls(s, nan_as_null=nan_as_null) + with warnings.catch_warnings(): + warnings.simplefilter("ignore") + result = cls(s, nan_as_null=nan_as_null) + return result @property # type: ignore @_cudf_nvtx_annotate @@ -2642,7 +2666,9 @@ def mode(self, dropna=True): if len(val_counts) > 0: val_counts = val_counts[val_counts == val_counts.iloc[0]] - return Series(val_counts.index.sort_values(), name=self.name) + return Series._from_data( + {self.name: val_counts.index.sort_values()}, name=self.name + ) @_cudf_nvtx_annotate def round(self, decimals=0, how="half_even"): diff --git a/python/cudf/cudf/testing/_utils.py b/python/cudf/cudf/testing/_utils.py index e949f7d78e7..9182246826f 100644 --- a/python/cudf/cudf/testing/_utils.py +++ b/python/cudf/cudf/testing/_utils.py @@ -397,8 +397,12 @@ def assert_column_memory_ne( raise AssertionError("lhs and rhs holds the same memory.") -def _create_pandas_series(data=None, index=None, dtype=None, *args, **kwargs): - # Wrapper around pd.Series using a float64 default dtype for empty data. +def _create_pandas_series_float64_default( + data=None, index=None, dtype=None, *args, **kwargs +): + # Wrapper around pd.Series using a float64 + # default dtype for empty data to silence warnings. + # TODO: Remove this in pandas-2.0 upgrade if dtype is None and ( data is None or (not is_scalar(data) and len(data) == 0) ): @@ -406,6 +410,19 @@ def _create_pandas_series(data=None, index=None, dtype=None, *args, **kwargs): return pd.Series(data=data, index=index, dtype=dtype, *args, **kwargs) +def _create_cudf_series_float64_default( + data=None, index=None, dtype=None, *args, **kwargs +): + # Wrapper around cudf.Series using a float64 + # default dtype for empty data to silence warnings. 
+ # TODO: Remove this in pandas-2.0 upgrade + if dtype is None and ( + data is None or (not is_scalar(data) and len(data) == 0) + ): + dtype = "float64" + return cudf.Series(data=data, index=index, dtype=dtype, *args, **kwargs) + + parametrize_numeric_dtypes_pairwise = pytest.mark.parametrize( "left_dtype,right_dtype", list(itertools.combinations_with_replacement(NUMERIC_TYPES, 2)), diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py index b69f22ade81..bc85987c612 100644 --- a/python/cudf/cudf/tests/test_dataframe.py +++ b/python/cudf/cudf/tests/test_dataframe.py @@ -30,6 +30,7 @@ ALL_TYPES, DATETIME_TYPES, NUMERIC_TYPES, + _create_cudf_series_float64_default, assert_eq, assert_exceptions_equal, assert_neq, @@ -2000,8 +2001,8 @@ def test_series_shape(): def test_series_shape_empty(): - ps = pd.Series(dtype="float64") - cs = cudf.Series([]) + ps = pd.Series([], dtype="float64") + cs = cudf.Series([], dtype="float64") assert ps.shape == cs.shape @@ -2840,7 +2841,7 @@ def test_series_all_null(num_elements, null_type): @pytest.mark.parametrize("num_elements", [0, 2, 10, 100]) def test_series_all_valid_nan(num_elements): data = [np.nan] * num_elements - sr = cudf.Series(data, nan_as_null=False) + sr = _create_cudf_series_float64_default(data, nan_as_null=False) np.testing.assert_equal(sr.null_count, 0) @@ -4073,28 +4074,28 @@ def test_empty_dataframe_describe(): def test_as_column_types(): - col = column.as_column(cudf.Series([])) + col = column.as_column(cudf.Series([], dtype="float64")) assert_eq(col.dtype, np.dtype("float64")) gds = cudf.Series(col) pds = pd.Series(pd.Series([], dtype="float64")) assert_eq(pds, gds) - col = column.as_column(cudf.Series([]), dtype="float32") + col = column.as_column(cudf.Series([], dtype="float64"), dtype="float32") assert_eq(col.dtype, np.dtype("float32")) gds = cudf.Series(col) pds = pd.Series(pd.Series([], dtype="float32")) assert_eq(pds, gds) - col = column.as_column(cudf.Series([]), dtype="str") + col = column.as_column(cudf.Series([], dtype="float64"), dtype="str") assert_eq(col.dtype, np.dtype("object")) gds = cudf.Series(col) pds = pd.Series(pd.Series([], dtype="str")) assert_eq(pds, gds) - col = column.as_column(cudf.Series([]), dtype="object") + col = column.as_column(cudf.Series([], dtype="float64"), dtype="object") assert_eq(col.dtype, np.dtype("object")) gds = cudf.Series(col) pds = pd.Series(pd.Series([], dtype="object")) @@ -4469,7 +4470,7 @@ def test_create_dataframe_column(): ) def test_series_values_host_property(data): pds = pd.Series(data=data, dtype=None if data else float) - gds = cudf.Series(data) + gds = _create_cudf_series_float64_default(data) np.testing.assert_array_equal(pds.values, gds.values_host) @@ -4492,7 +4493,7 @@ def test_series_values_host_property(data): ) def test_series_values_property(data): pds = pd.Series(data=data, dtype=None if data else float) - gds = cudf.Series(data) + gds = _create_cudf_series_float64_default(data) gds_vals = gds.values assert isinstance(gds_vals, cupy.ndarray) np.testing.assert_array_equal(gds_vals.get(), pds.values) diff --git a/python/cudf/cudf/tests/test_dropna.py b/python/cudf/cudf/tests/test_dropna.py index 3277e52edb3..1def6597706 100644 --- a/python/cudf/cudf/tests/test_dropna.py +++ b/python/cudf/cudf/tests/test_dropna.py @@ -1,11 +1,14 @@ -# Copyright (c) 2020-2022, NVIDIA CORPORATION. +# Copyright (c) 2020-2023, NVIDIA CORPORATION. 
import numpy as np import pandas as pd import pytest import cudf -from cudf.testing._utils import _create_pandas_series, assert_eq +from cudf.testing._utils import ( + _create_pandas_series_float64_default, + assert_eq, +) @pytest.mark.parametrize( @@ -22,7 +25,7 @@ @pytest.mark.parametrize("inplace", [True, False]) def test_dropna_series(data, nulls, inplace): - psr = _create_pandas_series(data) + psr = _create_pandas_series_float64_default(data) if len(data) > 0: if nulls == "one": diff --git a/python/cudf/cudf/tests/test_duplicates.py b/python/cudf/cudf/tests/test_duplicates.py index f77e7b4d775..ddbfdf5eee2 100644 --- a/python/cudf/cudf/tests/test_duplicates.py +++ b/python/cudf/cudf/tests/test_duplicates.py @@ -10,7 +10,7 @@ import cudf from cudf import concat from cudf.testing._utils import ( - _create_pandas_series, + _create_pandas_series_float64_default, assert_eq, assert_exceptions_equal, ) @@ -62,7 +62,7 @@ def test_duplicated_with_misspelled_column_name(subset): ], ) def test_drop_duplicates_series(data, keep): - pds = _create_pandas_series(data) + pds = _create_pandas_series_float64_default(data) gds = cudf.from_pandas(pds) assert_df(pds.drop_duplicates(keep=keep), gds.drop_duplicates(keep=keep)) diff --git a/python/cudf/cudf/tests/test_index.py b/python/cudf/cudf/tests/test_index.py index b3791cddce3..29232f63e90 100644 --- a/python/cudf/cudf/tests/test_index.py +++ b/python/cudf/cudf/tests/test_index.py @@ -30,7 +30,8 @@ SIGNED_INTEGER_TYPES, SIGNED_TYPES, UNSIGNED_TYPES, - _create_pandas_series, + _create_cudf_series_float64_default, + _create_pandas_series_float64_default, assert_column_memory_eq, assert_column_memory_ne, assert_eq, @@ -1006,8 +1007,8 @@ def test_index_equal_misc(data, other): actual = gd_data.equals(np.array(gd_other)) assert_eq(expected, actual) - expected = pd_data.equals(_create_pandas_series(pd_other)) - actual = gd_data.equals(cudf.Series(gd_other)) + expected = pd_data.equals(_create_pandas_series_float64_default(pd_other)) + actual = gd_data.equals(_create_cudf_series_float64_default(gd_other)) assert_eq(expected, actual) expected = pd_data.astype("category").equals(pd_other) @@ -2275,7 +2276,7 @@ def test_index_nan_as_null(data, nan_idx, NA_idx, nan_as_null): ], ) def test_isin_index(data, values): - psr = _create_pandas_series(data) + psr = _create_pandas_series_float64_default(data) gsr = cudf.Series.from_pandas(psr) got = gsr.index.isin(values) @@ -2780,6 +2781,13 @@ def test_index_empty_from_pandas(request, dtype): assert_eq(pidx, gidx) +def test_empty_index_init(): + pidx = pd.Index([]) + gidx = cudf.Index([]) + + assert_eq(pidx, gidx) + + @pytest.mark.parametrize( "data", [[1, 2, 3], ["ab", "cd", "e", None], range(0, 10)] ) diff --git a/python/cudf/cudf/tests/test_rolling.py b/python/cudf/cudf/tests/test_rolling.py index b4e0983a9e3..43fa83e1735 100644 --- a/python/cudf/cudf/tests/test_rolling.py +++ b/python/cudf/cudf/tests/test_rolling.py @@ -9,7 +9,10 @@ import cudf from cudf.core._compat import PANDAS_GE_150, PANDAS_LT_140 -from cudf.testing._utils import _create_pandas_series, assert_eq +from cudf.testing._utils import ( + _create_pandas_series_float64_default, + assert_eq, +) from cudf.testing.dataset_generator import rand_dataframe @@ -55,7 +58,7 @@ def test_rolling_series_basic(data, index, agg, nulls, center): elif nulls == "all": data = [np.nan] * len(data) - psr = _create_pandas_series(data, index=index) + psr = _create_pandas_series_float64_default(data, index=index) gsr = cudf.Series(psr) for window_size in range(1, len(data) + 
1): for min_periods in range(1, window_size + 1): @@ -313,7 +316,7 @@ def test_rolling_getitem_window(): @pytest.mark.parametrize("center", [True, False]) def test_rollling_series_numba_udf_basic(data, index, center): - psr = _create_pandas_series(data, index=index) + psr = _create_pandas_series_float64_default(data, index=index) gsr = cudf.from_pandas(psr) def some_func(A): diff --git a/python/cudf/cudf/tests/test_series.py b/python/cudf/cudf/tests/test_series.py index b1e991106ee..cfa571a0f54 100644 --- a/python/cudf/cudf/tests/test_series.py +++ b/python/cudf/cudf/tests/test_series.py @@ -19,7 +19,8 @@ NUMERIC_TYPES, SERIES_OR_INDEX_NAMES, TIMEDELTA_TYPES, - _create_pandas_series, + _create_cudf_series_float64_default, + _create_pandas_series_float64_default, assert_eq, assert_exceptions_equal, expect_warning_if, @@ -400,8 +401,8 @@ def test_series_tolist(data): [[], [None, None], ["a"], ["a", "b", "c"] * 500, [1.0, 2.0, 0.3] * 57], ) def test_series_size(data): - psr = _create_pandas_series(data) - gsr = cudf.Series(data) + psr = _create_pandas_series_float64_default(data) + gsr = _create_cudf_series_float64_default(data) assert_eq(psr.size, gsr.size) @@ -487,7 +488,7 @@ def test_series_describe_other_types(ps): ) @pytest.mark.parametrize("na_sentinel", [99999, 11, -1, 0]) def test_series_factorize(data, na_sentinel): - gsr = cudf.Series(data) + gsr = _create_cudf_series_float64_default(data) psr = gsr.to_pandas() with pytest.warns(FutureWarning): @@ -510,7 +511,7 @@ def test_series_factorize(data, na_sentinel): ) @pytest.mark.parametrize("use_na_sentinel", [True, False]) def test_series_factorize_use_na_sentinel(data, use_na_sentinel): - gsr = cudf.Series(data) + gsr = _create_cudf_series_float64_default(data) psr = gsr.to_pandas(nullable=True) expected_labels, expected_cats = psr.factorize( @@ -534,7 +535,7 @@ def test_series_factorize_use_na_sentinel(data, use_na_sentinel): ) @pytest.mark.parametrize("sort", [True, False]) def test_series_factorize_sort(data, sort): - gsr = cudf.Series(data) + gsr = _create_cudf_series_float64_default(data) psr = gsr.to_pandas(nullable=True) expected_labels, expected_cats = psr.factorize(sort=sort) @@ -734,7 +735,7 @@ def test_series_value_counts_optional_arguments(ascending, dropna, normalize): ], dtype="datetime64[ns]", ), - cudf.Series(name="empty series"), + cudf.Series(name="empty series", dtype="float64"), cudf.Series(["a", "b", "c", " ", "a", "b", "z"], dtype="category"), ], ) @@ -1415,7 +1416,7 @@ def test_series_hash_values_invalid_method(): def test_set_index_unequal_length(): - s = cudf.Series() + s = cudf.Series(dtype="float64") with pytest.raises(ValueError): s.index = [1, 2, 3] @@ -1682,7 +1683,7 @@ def test_series_nunique_index(data): ], ) def test_axes(data): - csr = cudf.Series(data) + csr = _create_cudf_series_float64_default(data) psr = csr.to_pandas() expected = psr.axes @@ -1760,7 +1761,7 @@ def test_series_truncate_datetimeindex(): ) def test_isin_numeric(data, values): index = np.random.randint(0, 100, len(data)) - psr = _create_pandas_series(data, index=index) + psr = _create_pandas_series_float64_default(data, index=index) gsr = cudf.Series.from_pandas(psr, nan_as_null=False) expected = psr.isin(values) @@ -1820,7 +1821,7 @@ def test_fill_new_category(): ], ) def test_isin_datetime(data, values): - psr = _create_pandas_series(data) + psr = _create_pandas_series_float64_default(data) gsr = cudf.Series.from_pandas(psr) got = gsr.isin(values) @@ -1849,7 +1850,7 @@ def test_isin_datetime(data, values): ], ) def 
test_isin_string(data, values): - psr = _create_pandas_series(data) + psr = _create_pandas_series_float64_default(data) gsr = cudf.Series.from_pandas(psr) got = gsr.isin(values) @@ -1878,7 +1879,7 @@ def test_isin_string(data, values): ], ) def test_isin_categorical(data, values): - psr = _create_pandas_series(data) + psr = _create_pandas_series_float64_default(data) gsr = cudf.Series.from_pandas(psr) got = gsr.isin(values) @@ -2099,7 +2100,7 @@ def test_series_to_dict(into): ], ) def test_series_hasnans(data): - gs = cudf.Series(data, nan_as_null=False) + gs = _create_cudf_series_float64_default(data, nan_as_null=False) ps = gs.to_pandas(nullable=True) assert_eq(gs.hasnans, ps.hasnans) @@ -2170,8 +2171,8 @@ def test_series_init_dict_with_index(data, index): "index", [None, ["b", "c"], ["d", "a", "c", "b"], ["a"]] ) def test_series_init_scalar_with_index(data, index): - pandas_series = _create_pandas_series(data, index=index) - cudf_series = cudf.Series(data, index=index) + pandas_series = _create_pandas_series_float64_default(data, index=index) + cudf_series = _create_cudf_series_float64_default(data, index=index) assert_eq( pandas_series, @@ -2313,7 +2314,15 @@ def test_series_round_builtin(data, digits): assert_eq(expected, actual) +def test_series_empty_warning(): + with pytest.warns(FutureWarning): + expected = pd.Series([]) + with pytest.warns(FutureWarning): + actual = cudf.Series([]) + assert_eq(expected, actual) + + def test_series_count_invalid_param(): - s = cudf.Series([]) + s = cudf.Series([], dtype="float64") with pytest.raises(TypeError): s.count(skipna=True) diff --git a/python/cudf/cudf/tests/test_stats.py b/python/cudf/cudf/tests/test_stats.py index 463cdb8a7f4..3ac605a1a4d 100644 --- a/python/cudf/cudf/tests/test_stats.py +++ b/python/cudf/cudf/tests/test_stats.py @@ -10,7 +10,8 @@ import cudf from cudf.datasets import randomdata from cudf.testing._utils import ( - _create_pandas_series, + _create_cudf_series_float64_default, + _create_pandas_series_float64_default, assert_eq, assert_exceptions_equal, expect_warning_if, @@ -222,8 +223,8 @@ def test_approx_quantiles_int(): ) def test_misc_quantiles(data, q): - pdf_series = _create_pandas_series(data) - gdf_series = cudf.Series(data) + pdf_series = _create_pandas_series_float64_default(data) + gdf_series = _create_cudf_series_float64_default(data) expected = pdf_series.quantile(q.get() if isinstance(q, cp.ndarray) else q) actual = gdf_series.quantile(q) @@ -242,7 +243,7 @@ def test_misc_quantiles(data, q): [5, 10, 53, None, np.nan, None, 12, 43, -423], nan_as_null=False ), cudf.Series([1.1032, 2.32, 43.4, 13, -312.0], index=[0, 4, 3, 19, 6]), - cudf.Series([]), + cudf.Series([], dtype="float64"), cudf.Series([-3]), ], ) @@ -292,7 +293,7 @@ def test_kurt_skew_error(op): [5, 10, 53, None, np.nan, None, 12, 43, -423], nan_as_null=False ), cudf.Series([1.1032, 2.32, 43.4, 13, -312.0], index=[0, 4, 3, 19, 6]), - cudf.Series([]), + cudf.Series([], dtype="float64"), cudf.Series([-3]), ], ) @@ -348,7 +349,7 @@ def test_series_median(dtype, num_na): np.zeros(100), np.array([1.123, 2.343, np.nan, 0.0]), np.array([-2, 3.75, 6, None, None, None, -8.5, None, 4.2]), - cudf.Series([]), + cudf.Series([], dtype="float64"), cudf.Series([-3]), ], ) @@ -376,7 +377,7 @@ def test_series_pct_change(data, periods, fill_method): np.array([1.123, 2.343, np.nan, 0.0]), cudf.Series([5, 10, 53, None, np.nan, None], nan_as_null=False), cudf.Series([1.1, 2.32, 43.4], index=[0, 4, 3]), - cudf.Series([]), + cudf.Series([], dtype="float64"), 
cudf.Series([-3]), ], ) @@ -420,7 +421,7 @@ def test_cov1d(data1, data2): np.array([1.123, 2.343, np.nan, 0.0]), cudf.Series([5, 10, 53, None, np.nan, None], nan_as_null=False), cudf.Series([1.1032, 2.32, 43.4], index=[0, 4, 3]), - cudf.Series([]), + cudf.Series([], dtype="float64"), cudf.Series([-3]), ], ) @@ -524,14 +525,14 @@ def test_df_corr(method): ) @pytest.mark.parametrize("skipna", [True, False]) def test_nans_stats(data, ops, skipna): - psr = _create_pandas_series(data) - gsr = cudf.Series(data, nan_as_null=False) + psr = _create_pandas_series_float64_default(data) + gsr = _create_cudf_series_float64_default(data, nan_as_null=False) assert_eq( getattr(psr, ops)(skipna=skipna), getattr(gsr, ops)(skipna=skipna) ) - gsr = cudf.Series(data, nan_as_null=False) + gsr = _create_cudf_series_float64_default(data, nan_as_null=False) # Since there is no concept of `nan_as_null` in pandas, # nulls will be returned in the operations. So only # testing for `skipna=True` when `nan_as_null=False` From eb6d134d169ed077000ee7d075d5363dec066578 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Wed, 20 Sep 2023 06:49:14 -1000 Subject: [PATCH 190/230] Don't sort columns for DataFrame init from list of Series (#14136) closes #14132 This PR removes the re-sorting of dataframe columns when initialized by a series list. Authors: - Matthew Roeschke (https://github.com/mroeschke) Approvers: - GALI PREM SAGAR (https://github.com/galipremsagar) - Bradley Dice (https://github.com/bdice) URL: https://github.com/rapidsai/cudf/pull/14136 --- python/cudf/cudf/core/dataframe.py | 4 +--- python/cudf/cudf/tests/test_dataframe.py | 12 ++++++++++++ 2 files changed, 13 insertions(+), 3 deletions(-) diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index 6e664468644..1a780cc9e9f 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -7885,9 +7885,7 @@ def _get_union_of_indices(indexes): return indexes[0] else: merged_index = cudf.core.index.GenericIndex._concat(indexes) - merged_index = merged_index.drop_duplicates() - inds = merged_index._values.argsort() - return merged_index.take(inds) + return merged_index.drop_duplicates() def _get_union_of_series_names(series_list): diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py index bc85987c612..6180162ecdd 100644 --- a/python/cudf/cudf/tests/test_dataframe.py +++ b/python/cudf/cudf/tests/test_dataframe.py @@ -221,6 +221,18 @@ def test_init_unaligned_with_index(): assert_eq(pdf, gdf, check_dtype=False) +def test_init_series_list_columns_unsort(): + pseries = [ + pd.Series(i, index=["b", "a", "c"], name=str(i)) for i in range(3) + ] + gseries = [ + cudf.Series(i, index=["b", "a", "c"], name=str(i)) for i in range(3) + ] + pdf = pd.DataFrame(pseries) + gdf = cudf.DataFrame(gseries) + assert_eq(pdf, gdf) + + def test_series_basic(): # Make series from buffer a1 = np.arange(10, dtype=np.float64) From 40d4cc5565f600864c3b16f30d3d26fd4904deaf Mon Sep 17 00:00:00 2001 From: Ed Seidl Date: Wed, 20 Sep 2023 11:03:44 -0700 Subject: [PATCH 191/230] Refactor parquet thrift reader (#14097) Refactors the current `CompactProtocolReader` used to parse parquet file metadata. The main goal of the refactor is to allow easier use of `std::optional` fields in the thrift structs to prevent situations as in #14024 where an optional field is an empty string. 
The writer cannot distinguish between present-but-empty and not-present, so chooses the latter when writing the field. This PR adds a `ParquetFieldOptional` functor that can wrap the other field functors, obviating the need to write a new optional functor for each type. Authors: - Ed Seidl (https://github.com/etseidl) Approvers: - Vukasin Milovanovic (https://github.com/vuule) - Yunsong Wang (https://github.com/PointKernel) URL: https://github.com/rapidsai/cudf/pull/14097 --- .../io/parquet/compact_protocol_reader.cpp | 691 +++++++++++++++--- .../io/parquet/compact_protocol_reader.hpp | 586 +-------------- .../io/parquet/compact_protocol_writer.cpp | 30 +- .../io/parquet/compact_protocol_writer.hpp | 3 + cpp/src/io/parquet/parquet.hpp | 18 +- cpp/src/io/parquet/parquet_common.hpp | 2 +- cpp/src/io/parquet/writer_impl.cu | 38 +- 7 files changed, 662 insertions(+), 706 deletions(-) diff --git a/cpp/src/io/parquet/compact_protocol_reader.cpp b/cpp/src/io/parquet/compact_protocol_reader.cpp index ae11af92f78..5c7b8ca3f8c 100644 --- a/cpp/src/io/parquet/compact_protocol_reader.cpp +++ b/cpp/src/io/parquet/compact_protocol_reader.cpp @@ -18,27 +18,474 @@ #include #include +#include #include namespace cudf { namespace io { namespace parquet { -uint8_t const CompactProtocolReader::g_list2struct[16] = {0, - 1, - 2, - ST_FLD_BYTE, - ST_FLD_DOUBLE, - 5, - ST_FLD_I16, - 7, - ST_FLD_I32, - 9, - ST_FLD_I64, - ST_FLD_BINARY, - ST_FLD_STRUCT, - ST_FLD_MAP, - ST_FLD_SET, - ST_FLD_LIST}; + +/** + * @brief Base class for parquet field functors. + * + * Holds the field value used by all of the specialized functors. + */ +class parquet_field { + private: + int _field_val; + + protected: + parquet_field(int f) : _field_val(f) {} + + public: + virtual ~parquet_field() = default; + int field() const { return _field_val; } +}; + +/** + * @brief Abstract base class for list functors. 
+ */ +template +class parquet_field_list : public parquet_field { + private: + using read_func_type = std::function; + FieldType _expected_type; + read_func_type _read_value; + + protected: + std::vector& val; + + void bind_read_func(read_func_type fn) { _read_value = fn; } + + parquet_field_list(int f, std::vector& v, FieldType t) + : parquet_field(f), _expected_type(t), val(v) + { + } + + public: + inline bool operator()(CompactProtocolReader* cpr, int field_type) + { + if (field_type != ST_FLD_LIST) { return true; } + auto const [t, n] = cpr->get_listh(); + if (t != _expected_type) { return true; } + val.resize(n); + for (uint32_t i = 0; i < n; i++) { + if (_read_value(i, cpr)) { return true; } + } + return false; + } +}; + +/** + * @brief Functor to set value to bool read from CompactProtocolReader + * + * bool doesn't actually encode a value, we just use the field type to indicate true/false + * + * @return True if field type is not bool + */ +class parquet_field_bool : public parquet_field { + bool& val; + + public: + parquet_field_bool(int f, bool& v) : parquet_field(f), val(v) {} + + inline bool operator()(CompactProtocolReader* cpr, int field_type) + { + if (field_type != ST_FLD_TRUE && field_type != ST_FLD_FALSE) { return true; } + val = field_type == ST_FLD_TRUE; + return false; + } +}; + +/** + * @brief Functor to read a vector of booleans from CompactProtocolReader + * + * @return True if field types mismatch or if the process of reading a + * bool fails + */ +struct parquet_field_bool_list : public parquet_field_list { + parquet_field_bool_list(int f, std::vector& v) : parquet_field_list(f, v, ST_FLD_TRUE) + { + auto const read_value = [this](uint32_t i, CompactProtocolReader* cpr) { + auto const current_byte = cpr->getb(); + if (current_byte != ST_FLD_TRUE && current_byte != ST_FLD_FALSE) { return true; } + this->val[i] = current_byte == ST_FLD_TRUE; + return false; + }; + bind_read_func(read_value); + } +}; + +/** + * @brief Base type for a functor that reads an integer from CompactProtocolReader + * + * Assuming signed ints since the parquet spec does not use unsigned ints anywhere. 
+ * + * @return True if there is a type mismatch + */ +template +class parquet_field_int : public parquet_field { + static constexpr bool is_byte = std::is_same_v; + + T& val; + + public: + parquet_field_int(int f, T& v) : parquet_field(f), val(v) {} + + inline bool operator()(CompactProtocolReader* cpr, int field_type) + { + if constexpr (is_byte) { + val = cpr->getb(); + } else { + val = cpr->get_zigzag(); + } + return (field_type != EXPECTED_TYPE); + } +}; + +using parquet_field_int8 = parquet_field_int; +using parquet_field_int32 = parquet_field_int; +using parquet_field_int64 = parquet_field_int; + +/** + * @brief Functor to read a vector of integers from CompactProtocolReader + * + * @return True if field types mismatch or if the process of reading an + * integer fails + */ +template +struct parquet_field_int_list : public parquet_field_list { + parquet_field_int_list(int f, std::vector& v) : parquet_field_list(f, v, EXPECTED_TYPE) + { + auto const read_value = [this](uint32_t i, CompactProtocolReader* cpr) { + this->val[i] = cpr->get_zigzag(); + return false; + }; + this->bind_read_func(read_value); + } +}; + +using parquet_field_int64_list = parquet_field_int_list; + +/** + * @brief Functor to read a string from CompactProtocolReader + * + * @return True if field type mismatches or if size of string exceeds bounds + * of the CompactProtocolReader + */ +class parquet_field_string : public parquet_field { + std::string& val; + + public: + parquet_field_string(int f, std::string& v) : parquet_field(f), val(v) {} + + inline bool operator()(CompactProtocolReader* cpr, int field_type) + { + if (field_type != ST_FLD_BINARY) { return true; } + auto const n = cpr->get_u32(); + if (n < static_cast(cpr->m_end - cpr->m_cur)) { + val.assign(reinterpret_cast(cpr->m_cur), n); + cpr->m_cur += n; + return false; + } else { + return true; + } + } +}; + +/** + * @brief Functor to read a vector of strings from CompactProtocolReader + * + * @return True if field types mismatch or if the process of reading a + * string fails + */ +struct parquet_field_string_list : public parquet_field_list { + parquet_field_string_list(int f, std::vector& v) + : parquet_field_list(f, v, ST_FLD_BINARY) + { + auto const read_value = [this](uint32_t i, CompactProtocolReader* cpr) { + auto const l = cpr->get_u32(); + if (l < static_cast(cpr->m_end - cpr->m_cur)) { + this->val[i].assign(reinterpret_cast(cpr->m_cur), l); + cpr->m_cur += l; + } else { + return true; + } + return false; + }; + bind_read_func(read_value); + } +}; + +/** + * @brief Functor to set value to enum read from CompactProtocolReader + * + * @return True if field type is not int32 + */ +template +class parquet_field_enum : public parquet_field { + Enum& val; + + public: + parquet_field_enum(int f, Enum& v) : parquet_field(f), val(v) {} + inline bool operator()(CompactProtocolReader* cpr, int field_type) + { + val = static_cast(cpr->get_i32()); + return (field_type != ST_FLD_I32); + } +}; + +/** + * @brief Functor to read a vector of enums from CompactProtocolReader + * + * @return True if field types mismatch or if the process of reading an + * enum fails + */ +template +struct parquet_field_enum_list : public parquet_field_list { + parquet_field_enum_list(int f, std::vector& v) : parquet_field_list(f, v, ST_FLD_I32) + { + auto const read_value = [this](uint32_t i, CompactProtocolReader* cpr) { + this->val[i] = static_cast(cpr->get_i32()); + return false; + }; + this->bind_read_func(read_value); + } +}; + +/** + * @brief Functor to read a structure from 
CompactProtocolReader + * + * @return True if field types mismatch or if the process of reading a + * struct fails + */ +template +class parquet_field_struct : public parquet_field { + T& val; + + public: + parquet_field_struct(int f, T& v) : parquet_field(f), val(v) {} + + inline bool operator()(CompactProtocolReader* cpr, int field_type) + { + return (field_type != ST_FLD_STRUCT || !(cpr->read(&val))); + } +}; + +/** + * @brief Functor to read optional structures in unions + * + * @return True if field types mismatch + */ +template +class parquet_field_union_struct : public parquet_field { + E& enum_val; + thrust::optional& val; // union structs are always wrapped in std::optional + + public: + parquet_field_union_struct(int f, E& ev, thrust::optional& v) + : parquet_field(f), enum_val(ev), val(v) + { + } + + inline bool operator()(CompactProtocolReader* cpr, int field_type) + { + T v; + bool const res = parquet_field_struct(field(), v).operator()(cpr, field_type); + if (!res) { + val = v; + enum_val = static_cast(field()); + } + return res; + } +}; + +/** + * @brief Functor to read empty structures in unions + * + * Added to avoid having to define read() functions for empty structs contained in unions. + * + * @return True if field types mismatch + */ +template +class parquet_field_union_enumerator : public parquet_field { + E& val; + + public: + parquet_field_union_enumerator(int f, E& v) : parquet_field(f), val(v) {} + + inline bool operator()(CompactProtocolReader* cpr, int field_type) + { + if (field_type != ST_FLD_STRUCT) { return true; } + cpr->skip_struct_field(field_type); + val = static_cast(field()); + return false; + } +}; + +/** + * @brief Functor to read a vector of structures from CompactProtocolReader + * + * @return True if field types mismatch or if the process of reading a + * struct fails + */ +template +struct parquet_field_struct_list : public parquet_field_list { + parquet_field_struct_list(int f, std::vector& v) : parquet_field_list(f, v, ST_FLD_STRUCT) + { + auto const read_value = [this](uint32_t i, CompactProtocolReader* cpr) { + if (not cpr->read(&this->val[i])) { return true; } + return false; + }; + this->bind_read_func(read_value); + } +}; + +// TODO(ets): replace current union handling (which mirrors thrift) to use std::optional fields +// in a struct +/** + * @brief Functor to read a union member from CompactProtocolReader + * + * @tparam is_empty True if tparam `T` type is empty type, else false. 
+ * + * @return True if field types mismatch or if the process of reading a + * union member fails + */ +template +class ParquetFieldUnionFunctor : public parquet_field { + bool& is_set; + T& val; + + public: + ParquetFieldUnionFunctor(int f, bool& b, T& v) : parquet_field(f), is_set(b), val(v) {} + + inline bool operator()(CompactProtocolReader* cpr, int field_type) + { + if (field_type != ST_FLD_STRUCT) { + return true; + } else { + is_set = true; + return !cpr->read(&val); + } + } +}; + +template +class ParquetFieldUnionFunctor : public parquet_field { + bool& is_set; + T& val; + + public: + ParquetFieldUnionFunctor(int f, bool& b, T& v) : parquet_field(f), is_set(b), val(v) {} + + inline bool operator()(CompactProtocolReader* cpr, int field_type) + { + if (field_type != ST_FLD_STRUCT) { + return true; + } else { + is_set = true; + cpr->skip_struct_field(field_type); + return false; + } + } +}; + +template +ParquetFieldUnionFunctor> ParquetFieldUnion(int f, bool& b, T& v) +{ + return ParquetFieldUnionFunctor>(f, b, v); +} + +/** + * @brief Functor to read a binary from CompactProtocolReader + * + * @return True if field type mismatches or if size of binary exceeds bounds + * of the CompactProtocolReader + */ +class parquet_field_binary : public parquet_field { + std::vector& val; + + public: + parquet_field_binary(int f, std::vector& v) : parquet_field(f), val(v) {} + + inline bool operator()(CompactProtocolReader* cpr, int field_type) + { + if (field_type != ST_FLD_BINARY) { return true; } + auto const n = cpr->get_u32(); + if (n <= static_cast(cpr->m_end - cpr->m_cur)) { + val.resize(n); + val.assign(cpr->m_cur, cpr->m_cur + n); + cpr->m_cur += n; + return false; + } else { + return true; + } + } +}; + +/** + * @brief Functor to read a vector of binaries from CompactProtocolReader + * + * @return True if field types mismatch or if the process of reading a + * binary fails + */ +struct parquet_field_binary_list : public parquet_field_list> { + parquet_field_binary_list(int f, std::vector>& v) + : parquet_field_list(f, v, ST_FLD_BINARY) + { + auto const read_value = [this](uint32_t i, CompactProtocolReader* cpr) { + auto const l = cpr->get_u32(); + if (l <= static_cast(cpr->m_end - cpr->m_cur)) { + val[i].resize(l); + val[i].assign(cpr->m_cur, cpr->m_cur + l); + cpr->m_cur += l; + } else { + return true; + } + return false; + }; + bind_read_func(read_value); + } +}; + +/** + * @brief Functor to read a struct from CompactProtocolReader + * + * @return True if field type mismatches + */ +class parquet_field_struct_blob : public parquet_field { + std::vector& val; + + public: + parquet_field_struct_blob(int f, std::vector& v) : parquet_field(f), val(v) {} + inline bool operator()(CompactProtocolReader* cpr, int field_type) + { + if (field_type != ST_FLD_STRUCT) { return true; } + uint8_t const* const start = cpr->m_cur; + cpr->skip_struct_field(field_type); + if (cpr->m_cur > start) { val.assign(start, cpr->m_cur - 1); } + return false; + } +}; + +/** + * @brief functor to wrap functors for optional fields + */ +template +class parquet_field_optional : public parquet_field { + thrust::optional& val; + + public: + parquet_field_optional(int f, thrust::optional& v) : parquet_field(f), val(v) {} + + inline bool operator()(CompactProtocolReader* cpr, int field_type) + { + T v; + bool const res = FieldFunctor(field(), v).operator()(cpr, field_type); + if (!res) { val = v; } + return res; + } +}; /** * @brief Skips the number of bytes according to the specified struct type @@ -59,22 +506,21 @@ 
bool CompactProtocolReader::skip_struct_field(int t, int depth) case ST_FLD_BYTE: skip_bytes(1); break; case ST_FLD_DOUBLE: skip_bytes(8); break; case ST_FLD_BINARY: skip_bytes(get_u32()); break; - case ST_FLD_LIST: + case ST_FLD_LIST: [[fallthrough]]; case ST_FLD_SET: { - int c = getb(); - int n = c >> 4; - if (n == 0xf) n = get_i32(); - t = g_list2struct[c & 0xf]; - if (depth > 10) return false; - for (int32_t i = 0; i < n; i++) + auto const [t, n] = get_listh(); + if (depth > 10) { return false; } + for (uint32_t i = 0; i < n; i++) { skip_struct_field(t, depth + 1); + } } break; case ST_FLD_STRUCT: for (;;) { - int c = getb(); - t = c & 0xf; - if (!c) break; - if (depth > 10) return false; + int const c = getb(); + t = c & 0xf; + if (c == 0) { break; } // end of struct + if ((c & 0xf0) == 0) { get_i16(); } // field id is not a delta + if (depth > 10) { return false; } skip_struct_field(t, depth + 1); } break; @@ -125,11 +571,11 @@ inline bool function_builder(CompactProtocolReader* cpr, std::tuple int field = 0; while (true) { int const current_byte = cpr->getb(); - if (!current_byte) break; - int const field_delta = current_byte >> 4; - int const field_type = current_byte & 0xf; - field = field_delta ? field + field_delta : cpr->get_i16(); - bool exit_function = FunctionSwitchImpl::run(cpr, field_type, field, op); + if (!current_byte) { break; } + int const field_delta = current_byte >> 4; + int const field_type = current_byte & 0xf; + field = field_delta ? field + field_delta : cpr->get_i16(); + bool const exit_function = FunctionSwitchImpl::run(cpr, field_type, field, op); if (exit_function) { return false; } } return true; @@ -137,27 +583,30 @@ inline bool function_builder(CompactProtocolReader* cpr, std::tuple bool CompactProtocolReader::read(FileMetaData* f) { - auto op = std::make_tuple(ParquetFieldInt32(1, f->version), - ParquetFieldStructList(2, f->schema), - ParquetFieldInt64(3, f->num_rows), - ParquetFieldStructList(4, f->row_groups), - ParquetFieldStructList(5, f->key_value_metadata), - ParquetFieldString(6, f->created_by)); + using optional_list_column_order = + parquet_field_optional, parquet_field_struct_list>; + auto op = std::make_tuple(parquet_field_int32(1, f->version), + parquet_field_struct_list(2, f->schema), + parquet_field_int64(3, f->num_rows), + parquet_field_struct_list(4, f->row_groups), + parquet_field_struct_list(5, f->key_value_metadata), + parquet_field_string(6, f->created_by), + optional_list_column_order(7, f->column_orders)); return function_builder(this, op); } bool CompactProtocolReader::read(SchemaElement* s) { - auto op = std::make_tuple(ParquetFieldEnum(1, s->type), - ParquetFieldInt32(2, s->type_length), - ParquetFieldEnum(3, s->repetition_type), - ParquetFieldString(4, s->name), - ParquetFieldInt32(5, s->num_children), - ParquetFieldEnum(6, s->converted_type), - ParquetFieldInt32(7, s->decimal_scale), - ParquetFieldInt32(8, s->decimal_precision), - ParquetFieldOptionalInt32(9, s->field_id), - ParquetFieldStruct(10, s->logical_type)); + auto op = std::make_tuple(parquet_field_enum(1, s->type), + parquet_field_int32(2, s->type_length), + parquet_field_enum(3, s->repetition_type), + parquet_field_string(4, s->name), + parquet_field_int32(5, s->num_children), + parquet_field_enum(6, s->converted_type), + parquet_field_int32(7, s->decimal_scale), + parquet_field_int32(8, s->decimal_precision), + parquet_field_optional(9, s->field_id), + parquet_field_struct(10, s->logical_type)); return function_builder(this, op); } @@ -181,21 +630,21 @@ bool 
CompactProtocolReader::read(LogicalType* l) bool CompactProtocolReader::read(DecimalType* d) { - auto op = std::make_tuple(ParquetFieldInt32(1, d->scale), ParquetFieldInt32(2, d->precision)); + auto op = std::make_tuple(parquet_field_int32(1, d->scale), parquet_field_int32(2, d->precision)); return function_builder(this, op); } bool CompactProtocolReader::read(TimeType* t) { auto op = - std::make_tuple(ParquetFieldBool(1, t->isAdjustedToUTC), ParquetFieldStruct(2, t->unit)); + std::make_tuple(parquet_field_bool(1, t->isAdjustedToUTC), parquet_field_struct(2, t->unit)); return function_builder(this, op); } bool CompactProtocolReader::read(TimestampType* t) { auto op = - std::make_tuple(ParquetFieldBool(1, t->isAdjustedToUTC), ParquetFieldStruct(2, t->unit)); + std::make_tuple(parquet_field_bool(1, t->isAdjustedToUTC), parquet_field_struct(2, t->unit)); return function_builder(this, op); } @@ -209,123 +658,129 @@ bool CompactProtocolReader::read(TimeUnit* u) bool CompactProtocolReader::read(IntType* i) { - auto op = std::make_tuple(ParquetFieldInt8(1, i->bitWidth), ParquetFieldBool(2, i->isSigned)); + auto op = std::make_tuple(parquet_field_int8(1, i->bitWidth), parquet_field_bool(2, i->isSigned)); return function_builder(this, op); } bool CompactProtocolReader::read(RowGroup* r) { - auto op = std::make_tuple(ParquetFieldStructList(1, r->columns), - ParquetFieldInt64(2, r->total_byte_size), - ParquetFieldInt64(3, r->num_rows)); + auto op = std::make_tuple(parquet_field_struct_list(1, r->columns), + parquet_field_int64(2, r->total_byte_size), + parquet_field_int64(3, r->num_rows)); return function_builder(this, op); } bool CompactProtocolReader::read(ColumnChunk* c) { - auto op = std::make_tuple(ParquetFieldString(1, c->file_path), - ParquetFieldInt64(2, c->file_offset), - ParquetFieldStruct(3, c->meta_data), - ParquetFieldInt64(4, c->offset_index_offset), - ParquetFieldInt32(5, c->offset_index_length), - ParquetFieldInt64(6, c->column_index_offset), - ParquetFieldInt32(7, c->column_index_length)); + auto op = std::make_tuple(parquet_field_string(1, c->file_path), + parquet_field_int64(2, c->file_offset), + parquet_field_struct(3, c->meta_data), + parquet_field_int64(4, c->offset_index_offset), + parquet_field_int32(5, c->offset_index_length), + parquet_field_int64(6, c->column_index_offset), + parquet_field_int32(7, c->column_index_length)); return function_builder(this, op); } bool CompactProtocolReader::read(ColumnChunkMetaData* c) { - auto op = std::make_tuple(ParquetFieldEnum(1, c->type), - ParquetFieldEnumList(2, c->encodings), - ParquetFieldStringList(3, c->path_in_schema), - ParquetFieldEnum(4, c->codec), - ParquetFieldInt64(5, c->num_values), - ParquetFieldInt64(6, c->total_uncompressed_size), - ParquetFieldInt64(7, c->total_compressed_size), - ParquetFieldInt64(9, c->data_page_offset), - ParquetFieldInt64(10, c->index_page_offset), - ParquetFieldInt64(11, c->dictionary_page_offset), - ParquetFieldStruct(12, c->statistics)); + auto op = std::make_tuple(parquet_field_enum(1, c->type), + parquet_field_enum_list(2, c->encodings), + parquet_field_string_list(3, c->path_in_schema), + parquet_field_enum(4, c->codec), + parquet_field_int64(5, c->num_values), + parquet_field_int64(6, c->total_uncompressed_size), + parquet_field_int64(7, c->total_compressed_size), + parquet_field_int64(9, c->data_page_offset), + parquet_field_int64(10, c->index_page_offset), + parquet_field_int64(11, c->dictionary_page_offset), + parquet_field_struct(12, c->statistics)); return function_builder(this, op); } 
bool CompactProtocolReader::read(PageHeader* p) { - auto op = std::make_tuple(ParquetFieldEnum(1, p->type), - ParquetFieldInt32(2, p->uncompressed_page_size), - ParquetFieldInt32(3, p->compressed_page_size), - ParquetFieldStruct(5, p->data_page_header), - ParquetFieldStruct(7, p->dictionary_page_header), - ParquetFieldStruct(8, p->data_page_header_v2)); + auto op = std::make_tuple(parquet_field_enum(1, p->type), + parquet_field_int32(2, p->uncompressed_page_size), + parquet_field_int32(3, p->compressed_page_size), + parquet_field_struct(5, p->data_page_header), + parquet_field_struct(7, p->dictionary_page_header), + parquet_field_struct(8, p->data_page_header_v2)); return function_builder(this, op); } bool CompactProtocolReader::read(DataPageHeader* d) { - auto op = std::make_tuple(ParquetFieldInt32(1, d->num_values), - ParquetFieldEnum(2, d->encoding), - ParquetFieldEnum(3, d->definition_level_encoding), - ParquetFieldEnum(4, d->repetition_level_encoding)); + auto op = std::make_tuple(parquet_field_int32(1, d->num_values), + parquet_field_enum(2, d->encoding), + parquet_field_enum(3, d->definition_level_encoding), + parquet_field_enum(4, d->repetition_level_encoding)); return function_builder(this, op); } bool CompactProtocolReader::read(DictionaryPageHeader* d) { - auto op = std::make_tuple(ParquetFieldInt32(1, d->num_values), - ParquetFieldEnum(2, d->encoding)); + auto op = std::make_tuple(parquet_field_int32(1, d->num_values), + parquet_field_enum(2, d->encoding)); return function_builder(this, op); } bool CompactProtocolReader::read(DataPageHeaderV2* d) { - auto op = std::make_tuple(ParquetFieldInt32(1, d->num_values), - ParquetFieldInt32(2, d->num_nulls), - ParquetFieldInt32(3, d->num_rows), - ParquetFieldEnum(4, d->encoding), - ParquetFieldInt32(5, d->definition_levels_byte_length), - ParquetFieldInt32(6, d->repetition_levels_byte_length), - ParquetFieldBool(7, d->is_compressed)); + auto op = std::make_tuple(parquet_field_int32(1, d->num_values), + parquet_field_int32(2, d->num_nulls), + parquet_field_int32(3, d->num_rows), + parquet_field_enum(4, d->encoding), + parquet_field_int32(5, d->definition_levels_byte_length), + parquet_field_int32(6, d->repetition_levels_byte_length), + parquet_field_bool(7, d->is_compressed)); return function_builder(this, op); } bool CompactProtocolReader::read(KeyValue* k) { - auto op = std::make_tuple(ParquetFieldString(1, k->key), ParquetFieldString(2, k->value)); + auto op = std::make_tuple(parquet_field_string(1, k->key), parquet_field_string(2, k->value)); return function_builder(this, op); } bool CompactProtocolReader::read(PageLocation* p) { - auto op = std::make_tuple(ParquetFieldInt64(1, p->offset), - ParquetFieldInt32(2, p->compressed_page_size), - ParquetFieldInt64(3, p->first_row_index)); + auto op = std::make_tuple(parquet_field_int64(1, p->offset), + parquet_field_int32(2, p->compressed_page_size), + parquet_field_int64(3, p->first_row_index)); return function_builder(this, op); } bool CompactProtocolReader::read(OffsetIndex* o) { - auto op = std::make_tuple(ParquetFieldStructList(1, o->page_locations)); + auto op = std::make_tuple(parquet_field_struct_list(1, o->page_locations)); return function_builder(this, op); } bool CompactProtocolReader::read(ColumnIndex* c) { - auto op = std::make_tuple(ParquetFieldBoolList(1, c->null_pages), - ParquetFieldBinaryList(2, c->min_values), - ParquetFieldBinaryList(3, c->max_values), - ParquetFieldEnum(4, c->boundary_order), - ParquetFieldInt64List(5, c->null_counts)); + auto op = 
std::make_tuple(parquet_field_bool_list(1, c->null_pages), + parquet_field_binary_list(2, c->min_values), + parquet_field_binary_list(3, c->max_values), + parquet_field_enum(4, c->boundary_order), + parquet_field_int64_list(5, c->null_counts)); return function_builder(this, op); } bool CompactProtocolReader::read(Statistics* s) { - auto op = std::make_tuple(ParquetFieldBinary(1, s->max), - ParquetFieldBinary(2, s->min), - ParquetFieldInt64(3, s->null_count), - ParquetFieldInt64(4, s->distinct_count), - ParquetFieldBinary(5, s->max_value), - ParquetFieldBinary(6, s->min_value)); + auto op = std::make_tuple(parquet_field_binary(1, s->max), + parquet_field_binary(2, s->min), + parquet_field_int64(3, s->null_count), + parquet_field_int64(4, s->distinct_count), + parquet_field_binary(5, s->max_value), + parquet_field_binary(6, s->min_value)); + return function_builder(this, op); +} + +bool CompactProtocolReader::read(ColumnOrder* c) +{ + auto op = std::make_tuple(parquet_field_union_enumerator(1, c->type)); return function_builder(this, op); } @@ -338,7 +793,7 @@ bool CompactProtocolReader::read(Statistics* s) */ bool CompactProtocolReader::InitSchema(FileMetaData* md) { - if (static_cast(WalkSchema(md)) != md->schema.size()) return false; + if (static_cast(WalkSchema(md)) != md->schema.size()) { return false; } /* Inside FileMetaData, there is a std::vector of RowGroups and each RowGroup contains a * a std::vector of ColumnChunks. Each ColumnChunk has a member ColumnMetaData, which contains @@ -353,13 +808,15 @@ bool CompactProtocolReader::InitSchema(FileMetaData* md) for (auto const& path : column.meta_data.path_in_schema) { auto const it = [&] { // find_if starting at (current_schema_index + 1) and then wrapping - auto schema = [&](auto const& e) { return e.parent_idx == parent && e.name == path; }; - auto mid = md->schema.cbegin() + current_schema_index + 1; - auto it = std::find_if(mid, md->schema.cend(), schema); - if (it != md->schema.cend()) return it; + auto const schema = [&](auto const& e) { + return e.parent_idx == parent && e.name == path; + }; + auto const mid = md->schema.cbegin() + current_schema_index + 1; + auto const it = std::find_if(mid, md->schema.cend(), schema); + if (it != md->schema.cend()) { return it; } return std::find_if(md->schema.cbegin(), mid, schema); }(); - if (it == md->schema.cend()) return false; + if (it == md->schema.cend()) { return false; } current_schema_index = std::distance(md->schema.cbegin(), it); column.schema_idx = current_schema_index; parent = current_schema_index; @@ -401,9 +858,9 @@ int CompactProtocolReader::WalkSchema( if (e->num_children > 0) { for (int i = 0; i < e->num_children; i++) { e->children_idx.push_back(idx); - int idx_old = idx; - idx = WalkSchema(md, idx, parent_idx, max_def_level, max_rep_level); - if (idx <= idx_old) break; // Error + int const idx_old = idx; + idx = WalkSchema(md, idx, parent_idx, max_def_level, max_rep_level); + if (idx <= idx_old) { break; } // Error } } return idx; diff --git a/cpp/src/io/parquet/compact_protocol_reader.hpp b/cpp/src/io/parquet/compact_protocol_reader.hpp index 62ccacaac37..619815db503 100644 --- a/cpp/src/io/parquet/compact_protocol_reader.hpp +++ b/cpp/src/io/parquet/compact_protocol_reader.hpp @@ -22,6 +22,7 @@ #include #include #include +#include #include namespace cudf { @@ -40,9 +41,6 @@ namespace parquet { * compression codecs are supported yet. 
*/ class CompactProtocolReader { - protected: - static const uint8_t g_list2struct[16]; - public: explicit CompactProtocolReader(uint8_t const* base = nullptr, size_t len = 0) { init(base, len); } void init(uint8_t const* base, size_t len) @@ -57,45 +55,46 @@ class CompactProtocolReader { bytecnt = std::min(bytecnt, (size_t)(m_end - m_cur)); m_cur += bytecnt; } - uint32_t get_u32() noexcept + + // returns a varint encoded integer + template + T get_varint() noexcept { - uint32_t v = 0; + T v = 0; for (uint32_t l = 0;; l += 7) { - uint32_t c = getb(); + T c = getb(); v |= (c & 0x7f) << l; - if (c < 0x80) break; + if (c < 0x80) { break; } } return v; } - uint64_t get_u64() noexcept - { - uint64_t v = 0; - for (uint64_t l = 0;; l += 7) { - uint64_t c = getb(); - v |= (c & 0x7f) << l; - if (c < 0x80) break; - } - return v; - } - int32_t get_i16() noexcept { return get_i32(); } - int32_t get_i32() noexcept - { - uint32_t u = get_u32(); - return (int32_t)((u >> 1u) ^ -(int32_t)(u & 1)); - } - int64_t get_i64() noexcept + + // returns a zigzag encoded signed integer + template + T get_zigzag() noexcept { - uint64_t u = get_u64(); - return (int64_t)((u >> 1u) ^ -(int64_t)(u & 1)); + using U = std::make_unsigned_t; + U const u = get_varint(); + return static_cast((u >> 1u) ^ -static_cast(u & 1)); } - int32_t get_listh(uint8_t* el_type) noexcept + + // thrift spec says to use zigzag i32 for i16 types + int32_t get_i16() noexcept { return get_zigzag(); } + int32_t get_i32() noexcept { return get_zigzag(); } + int64_t get_i64() noexcept { return get_zigzag(); } + + uint32_t get_u32() noexcept { return get_varint(); } + uint64_t get_u64() noexcept { return get_varint(); } + + [[nodiscard]] std::pair get_listh() noexcept { - uint32_t c = getb(); - int32_t sz = c >> 4; - *el_type = c & 0xf; - if (sz == 0xf) sz = get_u32(); - return sz; + uint32_t const c = getb(); + uint32_t sz = c >> 4; + uint8_t t = c & 0xf; + if (sz == 0xf) { sz = get_u32(); } + return {t, sz}; } + bool skip_struct_field(int t, int depth = 0); public: @@ -120,6 +119,7 @@ class CompactProtocolReader { bool read(OffsetIndex* o); bool read(ColumnIndex* c); bool read(Statistics* s); + bool read(ColumnOrder* c); public: static int NumRequiredBits(uint32_t max_level) noexcept @@ -140,523 +140,11 @@ class CompactProtocolReader { uint8_t const* m_cur = nullptr; uint8_t const* m_end = nullptr; - friend class ParquetFieldBool; - friend class ParquetFieldBoolList; - friend class ParquetFieldInt8; - friend class ParquetFieldInt32; - friend class ParquetFieldOptionalInt32; - friend class ParquetFieldInt64; - friend class ParquetFieldInt64List; - template - friend class ParquetFieldStructListFunctor; - friend class ParquetFieldString; - template - friend class ParquetFieldStructFunctor; - template - friend class ParquetFieldUnionFunctor; - template - friend class ParquetFieldEnum; - template - friend class ParquetFieldEnumListFunctor; - friend class ParquetFieldStringList; - friend class ParquetFieldBinary; - friend class ParquetFieldBinaryList; - friend class ParquetFieldStructBlob; -}; - -/** - * @brief Functor to set value to bool read from CompactProtocolReader - * - * @return True if field type is not bool - */ -class ParquetFieldBool { - int field_val; - bool& val; - - public: - ParquetFieldBool(int f, bool& v) : field_val(f), val(v) {} - - inline bool operator()(CompactProtocolReader* cpr, int field_type) - { - return (field_type != ST_FLD_TRUE && field_type != ST_FLD_FALSE) || - !(val = (field_type == ST_FLD_TRUE), true); - } - - int 
field() { return field_val; } -}; - -/** - * @brief Functor to read a vector of booleans from CompactProtocolReader - * - * @return True if field types mismatch or if the process of reading a - * bool fails - */ -class ParquetFieldBoolList { - int field_val; - std::vector& val; - - public: - ParquetFieldBoolList(int f, std::vector& v) : field_val(f), val(v) {} - inline bool operator()(CompactProtocolReader* cpr, int field_type) - { - if (field_type != ST_FLD_LIST) return true; - uint8_t t; - int32_t n = cpr->get_listh(&t); - if (t != ST_FLD_TRUE) return true; - val.resize(n); - for (int32_t i = 0; i < n; i++) { - unsigned int current_byte = cpr->getb(); - if (current_byte != ST_FLD_TRUE && current_byte != ST_FLD_FALSE) return true; - val[i] = current_byte == ST_FLD_TRUE; - } - return false; - } - - int field() { return field_val; } -}; - -/** - * @brief Functor to set value to 8 bit integer read from CompactProtocolReader - * - * @return True if field type is not int8 - */ -class ParquetFieldInt8 { - int field_val; - int8_t& val; - - public: - ParquetFieldInt8(int f, int8_t& v) : field_val(f), val(v) {} - - inline bool operator()(CompactProtocolReader* cpr, int field_type) - { - val = cpr->getb(); - return (field_type != ST_FLD_BYTE); - } - - int field() { return field_val; } -}; - -/** - * @brief Functor to set value to 32 bit integer read from CompactProtocolReader - * - * @return True if field type is not int32 - */ -class ParquetFieldInt32 { - int field_val; - int32_t& val; - - public: - ParquetFieldInt32(int f, int32_t& v) : field_val(f), val(v) {} - - inline bool operator()(CompactProtocolReader* cpr, int field_type) - { - val = cpr->get_i32(); - return (field_type != ST_FLD_I32); - } - - int field() { return field_val; } -}; - -/** - * @brief Functor to set value to optional 32 bit integer read from CompactProtocolReader - * - * @return True if field type is not int32 - */ -class ParquetFieldOptionalInt32 { - int field_val; - std::optional& val; - - public: - ParquetFieldOptionalInt32(int f, std::optional& v) : field_val(f), val(v) {} - - inline bool operator()(CompactProtocolReader* cpr, int field_type) - { - val = cpr->get_i32(); - return (field_type != ST_FLD_I32); - } - - int field() { return field_val; } -}; - -/** - * @brief Functor to set value to 64 bit integer read from CompactProtocolReader - * - * @return True if field type is not int32 or int64 - */ -class ParquetFieldInt64 { - int field_val; - int64_t& val; - - public: - ParquetFieldInt64(int f, int64_t& v) : field_val(f), val(v) {} - - inline bool operator()(CompactProtocolReader* cpr, int field_type) - { - val = cpr->get_i64(); - return (field_type < ST_FLD_I16 || field_type > ST_FLD_I64); - } - - int field() { return field_val; } -}; - -/** - * @brief Functor to read a vector of 64-bit integers from CompactProtocolReader - * - * @return True if field types mismatch or if the process of reading an - * int64 fails - */ -class ParquetFieldInt64List { - int field_val; - std::vector& val; - - public: - ParquetFieldInt64List(int f, std::vector& v) : field_val(f), val(v) {} - inline bool operator()(CompactProtocolReader* cpr, int field_type) - { - if (field_type != ST_FLD_LIST) return true; - uint8_t t; - int32_t n = cpr->get_listh(&t); - if (t != ST_FLD_I64) return true; - val.resize(n); - for (int32_t i = 0; i < n; i++) { - val[i] = cpr->get_i64(); - } - return false; - } - - int field() { return field_val; } -}; - -/** - * @brief Functor to read a vector of structures from CompactProtocolReader - * - * @return True if 
field types mismatch or if the process of reading a - * struct fails - */ -template -class ParquetFieldStructListFunctor { - int field_val; - std::vector& val; - - public: - ParquetFieldStructListFunctor(int f, std::vector& v) : field_val(f), val(v) {} - - inline bool operator()(CompactProtocolReader* cpr, int field_type) - { - if (field_type != ST_FLD_LIST) return true; - - int current_byte = cpr->getb(); - if ((current_byte & 0xf) != ST_FLD_STRUCT) return true; - int n = current_byte >> 4; - if (n == 0xf) n = cpr->get_u32(); - val.resize(n); - for (int32_t i = 0; i < n; i++) { - if (!(cpr->read(&val[i]))) { return true; } - } - - return false; - } - - int field() { return field_val; } -}; - -template -ParquetFieldStructListFunctor ParquetFieldStructList(int f, std::vector& v) -{ - return ParquetFieldStructListFunctor(f, v); -} - -/** - * @brief Functor to read a string from CompactProtocolReader - * - * @return True if field type mismatches or if size of string exceeds bounds - * of the CompactProtocolReader - */ -class ParquetFieldString { - int field_val; - std::string& val; - - public: - ParquetFieldString(int f, std::string& v) : field_val(f), val(v) {} - - inline bool operator()(CompactProtocolReader* cpr, int field_type) - { - if (field_type != ST_FLD_BINARY) return true; - uint32_t n = cpr->get_u32(); - if (n < (size_t)(cpr->m_end - cpr->m_cur)) { - val.assign((char const*)cpr->m_cur, n); - cpr->m_cur += n; - return false; - } else { - return true; - } - } - - int field() { return field_val; } -}; - -/** - * @brief Functor to read a structure from CompactProtocolReader - * - * @return True if field types mismatch or if the process of reading a - * struct fails - */ -template -class ParquetFieldStructFunctor { - int field_val; - T& val; - - public: - ParquetFieldStructFunctor(int f, T& v) : field_val(f), val(v) {} - - inline bool operator()(CompactProtocolReader* cpr, int field_type) - { - return (field_type != ST_FLD_STRUCT || !(cpr->read(&val))); - } - - int field() { return field_val; } -}; - -template -ParquetFieldStructFunctor ParquetFieldStruct(int f, T& v) -{ - return ParquetFieldStructFunctor(f, v); -} - -/** - * @brief Functor to read a union member from CompactProtocolReader - * - * @tparam is_empty True if tparam `T` type is empty type, else false. 
- * - * @return True if field types mismatch or if the process of reading a - * union member fails - */ -template -class ParquetFieldUnionFunctor { - int field_val; - bool& is_set; - T& val; - - public: - ParquetFieldUnionFunctor(int f, bool& b, T& v) : field_val(f), is_set(b), val(v) {} - - inline bool operator()(CompactProtocolReader* cpr, int field_type) - { - if (field_type != ST_FLD_STRUCT) { - return true; - } else { - is_set = true; - return !cpr->read(&val); - } - } - - int field() { return field_val; } -}; - -template -struct ParquetFieldUnionFunctor { - int field_val; - bool& is_set; - T& val; - - public: - ParquetFieldUnionFunctor(int f, bool& b, T& v) : field_val(f), is_set(b), val(v) {} - - inline bool operator()(CompactProtocolReader* cpr, int field_type) - { - if (field_type != ST_FLD_STRUCT) { - return true; - } else { - is_set = true; - cpr->skip_struct_field(field_type); - return false; - } - } - - int field() { return field_val; } -}; - -template -ParquetFieldUnionFunctor> ParquetFieldUnion(int f, bool& b, T& v) -{ - return ParquetFieldUnionFunctor>(f, b, v); -} - -/** - * @brief Functor to set value to enum read from CompactProtocolReader - * - * @return True if field type is not int32 - */ -template -class ParquetFieldEnum { - int field_val; - Enum& val; - - public: - ParquetFieldEnum(int f, Enum& v) : field_val(f), val(v) {} - inline bool operator()(CompactProtocolReader* cpr, int field_type) - { - val = static_cast(cpr->get_i32()); - return (field_type != ST_FLD_I32); - } - - int field() { return field_val; } -}; - -/** - * @brief Functor to read a vector of enums from CompactProtocolReader - * - * @return True if field types mismatch or if the process of reading an - * enum fails - */ -template -class ParquetFieldEnumListFunctor { - int field_val; - std::vector& val; - - public: - ParquetFieldEnumListFunctor(int f, std::vector& v) : field_val(f), val(v) {} - inline bool operator()(CompactProtocolReader* cpr, int field_type) - { - if (field_type != ST_FLD_LIST) return true; - int current_byte = cpr->getb(); - if ((current_byte & 0xf) != ST_FLD_I32) return true; - int n = current_byte >> 4; - if (n == 0xf) n = cpr->get_u32(); - val.resize(n); - for (int32_t i = 0; i < n; i++) { - val[i] = static_cast(cpr->get_i32()); - } - return false; - } - - int field() { return field_val; } -}; - -template -ParquetFieldEnumListFunctor ParquetFieldEnumList(int field, std::vector& v) -{ - return ParquetFieldEnumListFunctor(field, v); -} - -/** - * @brief Functor to read a vector of strings from CompactProtocolReader - * - * @return True if field types mismatch or if the process of reading a - * string fails - */ -class ParquetFieldStringList { - int field_val; - std::vector& val; - - public: - ParquetFieldStringList(int f, std::vector& v) : field_val(f), val(v) {} - inline bool operator()(CompactProtocolReader* cpr, int field_type) - { - if (field_type != ST_FLD_LIST) return true; - uint8_t t; - int32_t n = cpr->get_listh(&t); - if (t != ST_FLD_BINARY) return true; - val.resize(n); - for (int32_t i = 0; i < n; i++) { - uint32_t l = cpr->get_u32(); - if (l < (size_t)(cpr->m_end - cpr->m_cur)) { - val[i].assign((char const*)cpr->m_cur, l); - cpr->m_cur += l; - } else - return true; - } - return false; - } - - int field() { return field_val; } -}; - -/** - * @brief Functor to read a binary from CompactProtocolReader - * - * @return True if field type mismatches or if size of binary exceeds bounds - * of the CompactProtocolReader - */ -class ParquetFieldBinary { - int field_val; - 
std::vector& val; - - public: - ParquetFieldBinary(int f, std::vector& v) : field_val(f), val(v) {} - - inline bool operator()(CompactProtocolReader* cpr, int field_type) - { - if (field_type != ST_FLD_BINARY) return true; - uint32_t n = cpr->get_u32(); - if (n <= (size_t)(cpr->m_end - cpr->m_cur)) { - val.resize(n); - val.assign(cpr->m_cur, cpr->m_cur + n); - cpr->m_cur += n; - return false; - } else { - return true; - } - } - - int field() { return field_val; } -}; - -/** - * @brief Functor to read a vector of binaries from CompactProtocolReader - * - * @return True if field types mismatch or if the process of reading a - * binary fails - */ -class ParquetFieldBinaryList { - int field_val; - std::vector>& val; - - public: - ParquetFieldBinaryList(int f, std::vector>& v) : field_val(f), val(v) {} - inline bool operator()(CompactProtocolReader* cpr, int field_type) - { - if (field_type != ST_FLD_LIST) return true; - uint8_t t; - int32_t n = cpr->get_listh(&t); - if (t != ST_FLD_BINARY) return true; - val.resize(n); - for (int32_t i = 0; i < n; i++) { - uint32_t l = cpr->get_u32(); - if (l <= (size_t)(cpr->m_end - cpr->m_cur)) { - val[i].resize(l); - val[i].assign(cpr->m_cur, cpr->m_cur + l); - cpr->m_cur += l; - } else - return true; - } - return false; - } - - int field() { return field_val; } -}; - -/** - * @brief Functor to read a struct from CompactProtocolReader - * - * @return True if field type mismatches - */ -class ParquetFieldStructBlob { - int field_val; - std::vector& val; - - public: - ParquetFieldStructBlob(int f, std::vector& v) : field_val(f), val(v) {} - inline bool operator()(CompactProtocolReader* cpr, int field_type) - { - if (field_type != ST_FLD_STRUCT) return true; - uint8_t const* start = cpr->m_cur; - cpr->skip_struct_field(field_type); - if (cpr->m_cur > start) { val.assign(start, cpr->m_cur - 1); } - return false; - } - - int field() { return field_val; } + friend class parquet_field_string; + friend class parquet_field_string_list; + friend class parquet_field_binary; + friend class parquet_field_binary_list; + friend class parquet_field_struct_blob; }; } // namespace parquet diff --git a/cpp/src/io/parquet/compact_protocol_writer.cpp b/cpp/src/io/parquet/compact_protocol_writer.cpp index b2c0c97c52d..60bc8984d81 100644 --- a/cpp/src/io/parquet/compact_protocol_writer.cpp +++ b/cpp/src/io/parquet/compact_protocol_writer.cpp @@ -33,18 +33,7 @@ size_t CompactProtocolWriter::write(FileMetaData const& f) c.field_struct_list(4, f.row_groups); if (not f.key_value_metadata.empty()) { c.field_struct_list(5, f.key_value_metadata); } if (not f.created_by.empty()) { c.field_string(6, f.created_by); } - if (f.column_order_listsize != 0) { - // Dummy list of struct containing an empty field1 struct - c.put_field_header(7, c.current_field(), ST_FLD_LIST); - c.put_byte((uint8_t)((std::min(f.column_order_listsize, 0xfu) << 4) | ST_FLD_STRUCT)); - if (f.column_order_listsize >= 0xf) c.put_uint(f.column_order_listsize); - for (uint32_t i = 0; i < f.column_order_listsize; i++) { - c.put_field_header(1, 0, ST_FLD_STRUCT); - c.put_byte(0); // ColumnOrder.field1 struct end - c.put_byte(0); // ColumnOrder struct end - } - c.set_current_field(7); - } + if (f.column_orders.has_value()) { c.field_struct_list(7, f.column_orders.value()); } return c.value(); } @@ -233,6 +222,16 @@ size_t CompactProtocolWriter::write(OffsetIndex const& s) return c.value(); } +size_t CompactProtocolWriter::write(ColumnOrder const& co) +{ + CompactProtocolFieldWriter c(*this); + switch (co) { + case 
ColumnOrder::TYPE_ORDER: c.field_empty_struct(1); break; + default: break; + } + return c.value(); +} + void CompactProtocolFieldWriter::put_byte(uint8_t v) { writer.m_buf.push_back(v); } void CompactProtocolFieldWriter::put_byte(uint8_t const* raw, uint32_t len) @@ -320,6 +319,13 @@ inline void CompactProtocolFieldWriter::field_struct(int field, T const& val) current_field_value = field; } +inline void CompactProtocolFieldWriter::field_empty_struct(int field) +{ + put_field_header(field, current_field_value, ST_FLD_STRUCT); + put_byte(0); // add a stop field + current_field_value = field; +} + template inline void CompactProtocolFieldWriter::field_struct_list(int field, std::vector const& val) { diff --git a/cpp/src/io/parquet/compact_protocol_writer.hpp b/cpp/src/io/parquet/compact_protocol_writer.hpp index 8d7b0961934..26d66527aa5 100644 --- a/cpp/src/io/parquet/compact_protocol_writer.hpp +++ b/cpp/src/io/parquet/compact_protocol_writer.hpp @@ -53,6 +53,7 @@ class CompactProtocolWriter { size_t write(Statistics const&); size_t write(PageLocation const&); size_t write(OffsetIndex const&); + size_t write(ColumnOrder const&); protected: std::vector& m_buf; @@ -94,6 +95,8 @@ class CompactProtocolFieldWriter { template inline void field_struct(int field, T const& val); + inline void field_empty_struct(int field); + template inline void field_struct_list(int field, std::vector const& val); diff --git a/cpp/src/io/parquet/parquet.hpp b/cpp/src/io/parquet/parquet.hpp index f7318bb9935..c2affc774c2 100644 --- a/cpp/src/io/parquet/parquet.hpp +++ b/cpp/src/io/parquet/parquet.hpp @@ -18,6 +18,8 @@ #include "parquet_common.hpp" +#include + #include #include #include @@ -118,6 +120,16 @@ struct LogicalType { BsonType BSON; }; +/** + * Union to specify the order used for the min_value and max_value fields for a column. + */ +struct ColumnOrder { + enum Type { UNDEFINED, TYPE_ORDER }; + Type type; + + operator Type() const { return type; } +}; + /** * @brief Struct for describing an element/field in the Parquet format schema * @@ -135,7 +147,7 @@ struct SchemaElement { int32_t num_children = 0; int32_t decimal_scale = 0; int32_t decimal_precision = 0; - std::optional field_id = std::nullopt; + thrust::optional field_id = thrust::nullopt; bool output_as_byte_array = false; // The following fields are filled in later during schema initialization @@ -284,8 +296,8 @@ struct FileMetaData { int64_t num_rows = 0; std::vector row_groups; std::vector key_value_metadata; - std::string created_by = ""; - uint32_t column_order_listsize = 0; + std::string created_by = ""; + thrust::optional> column_orders; }; /** diff --git a/cpp/src/io/parquet/parquet_common.hpp b/cpp/src/io/parquet/parquet_common.hpp index 5f8f1617cb9..5a1716bb547 100644 --- a/cpp/src/io/parquet/parquet_common.hpp +++ b/cpp/src/io/parquet/parquet_common.hpp @@ -141,7 +141,7 @@ enum BoundaryOrder { /** * @brief Thrift compact protocol struct field types */ -enum { +enum FieldType { ST_FLD_TRUE = 1, ST_FLD_FALSE = 2, ST_FLD_BYTE = 3, diff --git a/cpp/src/io/parquet/writer_impl.cu b/cpp/src/io/parquet/writer_impl.cu index d2976a3f5d9..a124f352ee4 100644 --- a/cpp/src/io/parquet/writer_impl.cu +++ b/cpp/src/io/parquet/writer_impl.cu @@ -74,8 +74,11 @@ struct aggregate_writer_metadata { for (size_t i = 0; i < partitions.size(); ++i) { this->files[i].num_rows = partitions[i].num_rows; } - this->column_order_listsize = - (stats_granularity != statistics_freq::STATISTICS_NONE) ? 
num_columns : 0; + + if (stats_granularity != statistics_freq::STATISTICS_NONE) { + ColumnOrder default_order = {ColumnOrder::TYPE_ORDER}; + this->column_orders = std::vector(num_columns, default_order); + } for (size_t p = 0; p < kv_md.size(); ++p) { std::transform(kv_md[p].begin(), @@ -102,13 +105,13 @@ struct aggregate_writer_metadata { { CUDF_EXPECTS(part < files.size(), "Invalid part index queried"); FileMetaData meta{}; - meta.version = this->version; - meta.schema = this->schema; - meta.num_rows = this->files[part].num_rows; - meta.row_groups = this->files[part].row_groups; - meta.key_value_metadata = this->files[part].key_value_metadata; - meta.created_by = this->created_by; - meta.column_order_listsize = this->column_order_listsize; + meta.version = this->version; + meta.schema = this->schema; + meta.num_rows = this->files[part].num_rows; + meta.row_groups = this->files[part].row_groups; + meta.key_value_metadata = this->files[part].key_value_metadata; + meta.created_by = this->created_by; + meta.column_orders = this->column_orders; return meta; } @@ -170,8 +173,8 @@ struct aggregate_writer_metadata { std::vector> column_indexes; }; std::vector files; - std::string created_by = ""; - uint32_t column_order_listsize = 0; + std::string created_by = ""; + thrust::optional> column_orders = thrust::nullopt; }; namespace { @@ -2373,20 +2376,7 @@ std::unique_ptr> writer::merge_row_group_metadata( md.num_rows += tmp.num_rows; } } - // Reader doesn't currently populate column_order, so infer it here - if (not md.row_groups.empty()) { - auto const is_valid_stats = [](auto const& stats) { - return not stats.max.empty() || not stats.min.empty() || stats.null_count != -1 || - stats.distinct_count != -1 || not stats.max_value.empty() || - not stats.min_value.empty(); - }; - uint32_t num_columns = static_cast(md.row_groups[0].columns.size()); - md.column_order_listsize = - (num_columns > 0 && is_valid_stats(md.row_groups[0].columns[0].meta_data.statistics)) - ? num_columns - : 0; - } // Thrift-encode the resulting output file_header_s fhdr; file_ender_s fendr; From e87d2fc1df6105d802b300bad19a9937f8155613 Mon Sep 17 00:00:00 2001 From: Lawrence Mitchell Date: Wed, 20 Sep 2023 21:18:31 +0100 Subject: [PATCH 192/230] Reduce memory usage of as_categorical_column (#14138) The main culprit is in the way the codes returned from _label_encoding were being ordered. We were generating an int64 column for the order, gathering through the left gather map, and then argsorting, before using that ordering as a gather map for the codes. We note that gather(y, with=argsort(x)) is equivalent to sort_by_key(y, with=x) so use that instead (avoiding an unnecessary gather). Furthermore we also note that gather([0..n), with=x) is just equivalent to x, so we can avoid a gather too. This reduces the peak memory footprint of categorifying a random column of 500_000_000 int32 values where there are 100 unique values from 24.75 GiB to 11.67 GiB. 
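The core of the rewrite is the identity noted above: gathering through an argsort permutation yields the same sequence as a single sort-by-key, so the explicit gather map never needs to be materialized. Below is a minimal standalone sketch of that identity in plain C++/STL (illustration only, not cudf code; the names and values are made up):

```cpp
// Sketch: gather(y, with=argsort(x)) == sort_by_key(y, with=x).
#include <algorithm>
#include <cassert>
#include <numeric>
#include <utility>
#include <vector>

int main()
{
  std::vector<int> x{3, 1, 2};     // sort keys
  std::vector<int> y{30, 10, 20};  // values to reorder

  // Left-hand side: materialize argsort(x), then gather y through it.
  std::vector<std::size_t> order(x.size());
  std::iota(order.begin(), order.end(), std::size_t{0});
  std::stable_sort(order.begin(), order.end(), [&](auto a, auto b) { return x[a] < x[b]; });
  std::vector<int> gathered;
  for (auto i : order) { gathered.push_back(y[i]); }

  // Right-hand side: sort (key, value) pairs by key and keep only the values.
  std::vector<std::pair<int, int>> pairs;
  for (std::size_t i = 0; i < x.size(); ++i) { pairs.emplace_back(x[i], y[i]); }
  std::stable_sort(pairs.begin(), pairs.end(),
                   [](auto const& a, auto const& b) { return a.first < b.first; });
  std::vector<int> by_key;
  for (auto const& p : pairs) { by_key.push_back(p.second); }

  assert(gathered == by_key);  // both are {10, 20, 30}; the intermediate gather map is unnecessary
  return 0;
}
```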
### Test code ```python import cudf import cupy as cp K = 100 N = 500_000_000 rng = cp.random._generator.RandomState() column = cudf.core.column.as_column(rng.choice(cp.arange(K, dtype="int32"), size=(N,), replace=True)) column = column.astype("category", ordered=False) ``` ### Before ![Screenshot from 2023-09-20 14-49-27](https://github.com/rapidsai/cudf/assets/1126981/08782501-c233-4efd-b4d6-a378cea82a82) ### After ![Screenshot from 2023-09-20 14-49-42](https://github.com/rapidsai/cudf/assets/1126981/93193bfb-a93e-45bf-8e5a-24289efc77c4) Authors: - Lawrence Mitchell (https://github.com/wence-) Approvers: - GALI PREM SAGAR (https://github.com/galipremsagar) - Bradley Dice (https://github.com/bdice) - Ashwin Srinath (https://github.com/shwina) URL: https://github.com/rapidsai/cudf/pull/14138 --- python/cudf/cudf/core/column/column.py | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index d2e2f11a12e..0bc50a521e2 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -1390,20 +1390,19 @@ def _return_sentinel_column(): except ValueError: return _return_sentinel_column() - codes = arange(len(cats), dtype=dtype) left_gather_map, right_gather_map = cpp_join( [self], [cats], how="left" ) - codes = codes.take( - right_gather_map, nullify=True, check_bounds=False - ).fillna(na_sentinel.value) - + codes = libcudf.copying.gather( + [arange(len(cats), dtype=dtype)], right_gather_map, nullify=True + ) + del right_gather_map # reorder `codes` so that its values correspond to the # values of `self`: - order = arange(len(self)) - order = order.take(left_gather_map, check_bounds=False).argsort() - codes = codes.take(order) - return codes + (codes,) = libcudf.sort.sort_by_key( + codes, [left_gather_map], [True], ["last"], stable=True + ) + return codes.fillna(na_sentinel.value) def column_empty_like( From fe99e4baa3a7cd0f87658bf1ea77b17ec61fd7dc Mon Sep 17 00:00:00 2001 From: David Wendt <45795991+davidwendt@users.noreply.github.com> Date: Thu, 21 Sep 2023 10:42:32 -0400 Subject: [PATCH 193/230] Expose stream parameter in public strings find APIs (#14060) Add stream parameter to public APIs: - `cudf::strings::find()` - `cudf::strings::rfind()` - `cudf::strings::contains()` - `cudf::strings::starts_with()` - `cudf::strings::ends_with()` - `cudf::strings::findall()` - `cudf::strings::find_multiple()` Also cleaned up some of the doxygen comments. 
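For callers, the upshot is that each of these APIs can now run on a user-supplied stream instead of the library default. The following is a hypothetical caller-side sketch; the input column and the caller-owned `rmm::cuda_stream` are assumptions made for illustration, while the `find`/`contains` signatures are the ones added in this patch:

```cpp
// Hypothetical caller-side sketch: running string searches on a caller-owned stream.
#include <cudf/scalar/scalar.hpp>
#include <cudf/strings/find.hpp>
#include <cudf/strings/strings_column_view.hpp>

#include <rmm/cuda_stream.hpp>

void search_on_stream(cudf::strings_column_view const& input)
{
  rmm::cuda_stream stream;  // caller-owned CUDA stream (assumption for illustration)
  auto const target = cudf::string_scalar("é", true, stream.view());

  // Character position of `target` in each row, computed on `stream`.
  auto positions = cudf::strings::find(input, target, 0, -1, stream.view());

  // Boolean column indicating whether each row contains `target`.
  auto found = cudf::strings::contains(input, target, stream.view());
}
```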
Reference #13744 Authors: - David Wendt (https://github.com/davidwendt) Approvers: - Vyas Ramasubramani (https://github.com/vyasr) - Vukasin Milovanovic (https://github.com/vuule) URL: https://github.com/rapidsai/cudf/pull/14060 --- cpp/include/cudf/strings/find.hpp | 102 ++++++++++++--------- cpp/include/cudf/strings/find_multiple.hpp | 12 ++- cpp/include/cudf/strings/findall.hpp | 2 + cpp/src/strings/search/find.cu | 24 +++-- cpp/src/strings/search/find_multiple.cu | 7 +- cpp/src/strings/search/findall.cu | 3 +- cpp/tests/CMakeLists.txt | 5 +- cpp/tests/streams/strings/find_test.cpp | 49 ++++++++++ 8 files changed, 143 insertions(+), 61 deletions(-) create mode 100644 cpp/tests/streams/strings/find_test.cpp diff --git a/cpp/include/cudf/strings/find.hpp b/cpp/include/cudf/strings/find.hpp index 2fed36862b9..c1aa8b294b3 100644 --- a/cpp/include/cudf/strings/find.hpp +++ b/cpp/include/cudf/strings/find.hpp @@ -43,19 +43,21 @@ namespace strings { * * @throw cudf::logic_error if start position is greater than stop position. * - * @param strings Strings instance for this operation. - * @param target UTF-8 encoded string to search for in each string. - * @param start First character position to include in the search. + * @param input Strings instance for this operation + * @param target UTF-8 encoded string to search for in each string + * @param start First character position to include in the search * @param stop Last position (exclusive) to include in the search. * Default of -1 will search to the end of the string. - * @param mr Device memory resource used to allocate the returned column's device memory. - * @return New integer column with character position values. + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate the returned column's device memory + * @return New integer column with character position values */ std::unique_ptr find( - strings_column_view const& strings, + strings_column_view const& input, string_scalar const& target, size_type start = 0, size_type stop = -1, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -72,19 +74,21 @@ std::unique_ptr find( * * @throw cudf::logic_error if start position is greater than stop position. * - * @param strings Strings instance for this operation. - * @param target UTF-8 encoded string to search for in each string. - * @param start First position to include in the search. + * @param input Strings instance for this operation + * @param target UTF-8 encoded string to search for in each string + * @param start First position to include in the search * @param stop Last position (exclusive) to include in the search. * Default of -1 will search starting at the end of the string. - * @param mr Device memory resource used to allocate the returned column's device memory. - * @return New integer column with character position values. 
+ * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate the returned column's device memory + * @return New integer column with character position values */ std::unique_ptr rfind( - strings_column_view const& strings, + strings_column_view const& input, string_scalar const& target, size_type start = 0, size_type stop = -1, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -123,37 +127,41 @@ std::unique_ptr find( * * Any null string entries return corresponding null entries in the output columns. * - * @param strings Strings instance for this operation. - * @param target UTF-8 encoded string to search for in each string. - * @param mr Device memory resource used to allocate the returned column's device memory. - * @return New type_id::BOOL8 column. + * @param input Strings instance for this operation + * @param target UTF-8 encoded string to search for in each string + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate the returned column's device memory + * @return New BOOL8 column */ std::unique_ptr contains( - strings_column_view const& strings, + strings_column_view const& input, string_scalar const& target, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** * @brief Returns a column of boolean values for each string where true indicates * the corresponding target string was found within that string in the provided column. * - * The 'output[i] = true` if string `targets[i]` is found inside `strings[i]` otherwise + * The 'output[i] = true` if string `targets[i]` is found inside `input[i]` otherwise * `output[i] = false`. * If `target[i]` is an empty string, true is returned for `output[i]`. * If `target[i]` is null, false is returned for `output[i]`. * - * Any null `strings[i]` row results in a null `output[i]` row. + * Any null string entries return corresponding null entries in the output columns. * * @throw cudf::logic_error if `strings.size() != targets.size()`. * - * @param strings Strings instance for this operation. - * @param targets Strings column of targets to check row-wise in `strings`. - * @param mr Device memory resource used to allocate the returned column's device memory. - * @return New type_id::BOOL8 column. + * @param input Strings instance for this operation + * @param targets Strings column of targets to check row-wise in `strings` + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate the returned column's device memory + * @return New BOOL8 column */ std::unique_ptr contains( - strings_column_view const& strings, + strings_column_view const& input, strings_column_view const& targets, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -166,14 +174,16 @@ std::unique_ptr contains( * * Any null string entries return corresponding null entries in the output columns. * - * @param strings Strings instance for this operation. - * @param target UTF-8 encoded string to search for in each string. - * @param mr Device memory resource used to allocate the returned column's device memory. 
+ * @param input Strings instance for this operation + * @param target UTF-8 encoded string to search for in each string + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate the returned column's device memory * @return New type_id::BOOL8 column. */ std::unique_ptr starts_with( - strings_column_view const& strings, + strings_column_view const& input, string_scalar const& target, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -190,14 +200,16 @@ std::unique_ptr starts_with( * * @throw cudf::logic_error if `strings.size() != targets.size()`. * - * @param strings Strings instance for this operation. - * @param targets Strings instance for this operation. - * @param mr Device memory resource used to allocate the returned column's device memory. - * @return New type_id::BOOL8 column. + * @param input Strings instance for this operation + * @param targets Strings instance for this operation + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate the returned column's device memory + * @return New BOOL8 column */ std::unique_ptr starts_with( - strings_column_view const& strings, + strings_column_view const& input, strings_column_view const& targets, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -210,14 +222,16 @@ std::unique_ptr starts_with( * * Any null string entries return corresponding null entries in the output columns. * - * @param strings Strings instance for this operation. - * @param target UTF-8 encoded string to search for in each string. - * @param mr Device memory resource used to allocate the returned column's device memory. - * @return New type_id::BOOL8 column. + * @param input Strings instance for this operation + * @param target UTF-8 encoded string to search for in each string + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate the returned column's device memory + * @return New BOOL8 column */ std::unique_ptr ends_with( - strings_column_view const& strings, + strings_column_view const& input, string_scalar const& target, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -234,14 +248,16 @@ std::unique_ptr ends_with( * * @throw cudf::logic_error if `strings.size() != targets.size()`. * - * @param strings Strings instance for this operation. - * @param targets Strings instance for this operation. - * @param mr Device memory resource used to allocate the returned column's device memory. - * @return New type_id::BOOL8 column. 
+ * @param input Strings instance for this operation + * @param targets Strings instance for this operation + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate the returned column's device memory + * @return New BOOL8 column */ std::unique_ptr ends_with( - strings_column_view const& strings, + strings_column_view const& input, strings_column_view const& targets, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @} */ // end of doxygen group } // namespace strings diff --git a/cpp/include/cudf/strings/find_multiple.hpp b/cpp/include/cudf/strings/find_multiple.hpp index 21cfdb15146..06b851c5012 100644 --- a/cpp/include/cudf/strings/find_multiple.hpp +++ b/cpp/include/cudf/strings/find_multiple.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2022, NVIDIA CORPORATION. + * Copyright (c) 2019-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -48,14 +48,16 @@ namespace strings { * * @throw cudf::logic_error if `targets` is empty or contains nulls * - * @param input Strings instance for this operation. - * @param targets Strings to search for in each string. - * @param mr Device memory resource used to allocate the returned column's device memory. - * @return Lists column with character position values. + * @param input Strings instance for this operation + * @param targets Strings to search for in each string + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate the returned column's device memory + * @return Lists column with character position values */ std::unique_ptr find_multiple( strings_column_view const& input, strings_column_view const& targets, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @} */ // end of doxygen group diff --git a/cpp/include/cudf/strings/findall.hpp b/cpp/include/cudf/strings/findall.hpp index 745f0fc19ff..379b9624dc6 100644 --- a/cpp/include/cudf/strings/findall.hpp +++ b/cpp/include/cudf/strings/findall.hpp @@ -57,12 +57,14 @@ struct regex_program; * * @param input Strings instance for this operation * @param prog Regex program instance + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned column's device memory * @return New lists column of strings */ std::unique_ptr findall( strings_column_view const& input, regex_program const& prog, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @} */ // end of doxygen group diff --git a/cpp/src/strings/search/find.cu b/cpp/src/strings/search/find.cu index 3de9dd34d83..1299e552565 100644 --- a/cpp/src/strings/search/find.cu +++ b/cpp/src/strings/search/find.cu @@ -305,20 +305,22 @@ std::unique_ptr find(strings_column_view const& strings, string_scalar const& target, size_type start, size_type stop, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::find(strings, target, start, stop, cudf::get_default_stream(), mr); + return detail::find(strings, target, start, stop, stream, mr); } std::unique_ptr 
rfind(strings_column_view const& strings, string_scalar const& target, size_type start, size_type stop, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::rfind(strings, target, start, stop, cudf::get_default_stream(), mr); + return detail::rfind(strings, target, start, stop, stream, mr); } std::unique_ptr find(strings_column_view const& input, @@ -618,50 +620,56 @@ std::unique_ptr ends_with(strings_column_view const& strings, std::unique_ptr contains(strings_column_view const& strings, string_scalar const& target, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::contains(strings, target, cudf::get_default_stream(), mr); + return detail::contains(strings, target, stream, mr); } std::unique_ptr contains(strings_column_view const& strings, strings_column_view const& targets, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::contains(strings, targets, cudf::get_default_stream(), mr); + return detail::contains(strings, targets, stream, mr); } std::unique_ptr starts_with(strings_column_view const& strings, string_scalar const& target, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::starts_with(strings, target, cudf::get_default_stream(), mr); + return detail::starts_with(strings, target, stream, mr); } std::unique_ptr starts_with(strings_column_view const& strings, strings_column_view const& targets, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::starts_with(strings, targets, cudf::get_default_stream(), mr); + return detail::starts_with(strings, targets, stream, mr); } std::unique_ptr ends_with(strings_column_view const& strings, string_scalar const& target, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::ends_with(strings, target, cudf::get_default_stream(), mr); + return detail::ends_with(strings, target, stream, mr); } std::unique_ptr ends_with(strings_column_view const& strings, strings_column_view const& targets, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::ends_with(strings, targets, cudf::get_default_stream(), mr); + return detail::ends_with(strings, targets, stream, mr); } } // namespace strings diff --git a/cpp/src/strings/search/find_multiple.cu b/cpp/src/strings/search/find_multiple.cu index 4a823ad1dcb..fcaec835f4d 100644 --- a/cpp/src/strings/search/find_multiple.cu +++ b/cpp/src/strings/search/find_multiple.cu @@ -70,8 +70,8 @@ std::unique_ptr find_multiple(strings_column_view const& input, results->set_null_count(0); auto offsets = cudf::detail::sequence(strings_count + 1, - numeric_scalar(0), - numeric_scalar(targets_count), + numeric_scalar(0, true, stream), + numeric_scalar(targets_count, true, stream), stream, mr); return make_lists_column(strings_count, @@ -88,10 +88,11 @@ std::unique_ptr find_multiple(strings_column_view const& input, // external API std::unique_ptr find_multiple(strings_column_view const& input, strings_column_view const& targets, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::find_multiple(input, targets, cudf::get_default_stream(), mr); + return detail::find_multiple(input, targets, stream, mr); } } // namespace strings diff --git a/cpp/src/strings/search/findall.cu 
b/cpp/src/strings/search/findall.cu index 2df64c6a0a7..acea4ff1c51 100644 --- a/cpp/src/strings/search/findall.cu +++ b/cpp/src/strings/search/findall.cu @@ -134,10 +134,11 @@ std::unique_ptr findall(strings_column_view const& input, std::unique_ptr findall(strings_column_view const& input, regex_program const& prog, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::findall(input, prog, cudf::get_default_stream(), mr); + return detail::findall(input, prog, stream, mr); } } // namespace strings diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt index 4923ef5c903..6414962903e 100644 --- a/cpp/tests/CMakeLists.txt +++ b/cpp/tests/CMakeLists.txt @@ -627,7 +627,10 @@ ConfigureTest(STREAM_CONCATENATE_TEST streams/concatenate_test.cpp STREAM_MODE t ConfigureTest(STREAM_FILLING_TEST streams/filling_test.cpp STREAM_MODE testing) ConfigureTest(STREAM_REPLACE_TEST streams/replace_test.cpp STREAM_MODE testing) ConfigureTest(STREAM_SEARCH_TEST streams/search_test.cpp STREAM_MODE testing) -ConfigureTest(STREAM_STRINGS_TEST streams/strings/case_test.cpp STREAM_MODE testing) +ConfigureTest( + STREAM_STRINGS_TEST streams/strings/case_test.cpp streams/strings/find_test.cpp STREAM_MODE + testing +) # ################################################################################################## # Install tests #################################################################################### diff --git a/cpp/tests/streams/strings/find_test.cpp b/cpp/tests/streams/strings/find_test.cpp new file mode 100644 index 00000000000..b734a1738cc --- /dev/null +++ b/cpp/tests/streams/strings/find_test.cpp @@ -0,0 +1,49 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include +#include +#include + +#include +#include +#include + +#include + +class StringsFindTest : public cudf::test::BaseFixture {}; + +TEST_F(StringsFindTest, Find) +{ + auto input = cudf::test::strings_column_wrapper({"Héllo", "thesé", "tést strings", ""}); + auto view = cudf::strings_column_view(input); + + auto const target = cudf::string_scalar("é", true, cudf::test::get_default_stream()); + cudf::strings::find(view, target, 0, -1, cudf::test::get_default_stream()); + cudf::strings::rfind(view, target, 0, -1, cudf::test::get_default_stream()); + cudf::strings::find(view, view, 0, cudf::test::get_default_stream()); + cudf::strings::find_multiple(view, view, cudf::test::get_default_stream()); + cudf::strings::contains(view, target, cudf::test::get_default_stream()); + cudf::strings::starts_with(view, target, cudf::test::get_default_stream()); + cudf::strings::starts_with(view, view, cudf::test::get_default_stream()); + cudf::strings::ends_with(view, target, cudf::test::get_default_stream()); + cudf::strings::ends_with(view, view, cudf::test::get_default_stream()); + + auto const pattern = std::string("[a-z]"); + auto const prog = cudf::strings::regex_program::create(pattern); + cudf::strings::findall(view, *prog, cudf::test::get_default_stream()); +} From 05ee2604d8f4e7c6525d12926100e2b11b6d6cb0 Mon Sep 17 00:00:00 2001 From: David Wendt <45795991+davidwendt@users.noreply.github.com> Date: Thu, 21 Sep 2023 10:45:11 -0400 Subject: [PATCH 194/230] Fix kernel launch error for cudf::io::orc::gpu::rowgroup_char_counts_kernel (#14139) Fixes memcheck error found during the nightly builds found in gtest `OrcWriterNumericTypeTest/0.SingleColumn` ``` # compute-sanitizer --tool memcheck gtests/ORC_TEST --gtest_filter=OrcWriterNumericTypeTest/0.SingleColumn --rmm_mode=cuda ========= COMPUTE-SANITIZER Note: Google Test filter = OrcWriterNumericTypeTest/0.SingleColumn [==========] Running 1 test from 1 test suite. [----------] Global test environment set-up. [----------] 1 test from OrcWriterNumericTypeTest/0, where TypeParam = signed char [ RUN ] OrcWriterNumericTypeTest/0.SingleColumn ========= Program hit cudaErrorInvalidConfiguration (error 9) due to "invalid configuration argument" on CUDA API call to cudaLaunchKernel. ========= Saved host backtrace up to driver entry point at error ========= Host Frame: [0x480aa6] ========= in /usr/lib/x86_64-linux-gnu/libcuda.so.1 ========= Host Frame:cudaLaunchKernel [0x6c358] ========= in /conda/envs/rapids/lib/libcudart.so.11.0 ========= Host Frame:__device_stub__ZN4cudf2io3orc3gpu27rowgroup_char_counts_kernelENS_6detail11base_2dspanIiNS_11device_spanEEENS5_IKNS1_22orc_column_device_viewELm18446744073709551615EEENS4_IKNS1_13rowgroup_rowsES5_EENS5_IKjLm18446744073709551615EEE(cudf::detail::base_2dspan&, cudf::device_span&, cudf::detail::base_2dspan&, cudf::device_span&) [0x14fccb4] ``` Adds a check to avoid the kernel launch if the number of strings column is zero. 
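For context, the failure mode is generic to CUDA: launching any kernel with a zero grid dimension is rejected with `cudaErrorInvalidConfiguration`, so the fix simply skips the launch when there is no work. A minimal, hypothetical sketch of that guard pattern (the kernel and sizes below are made up, not the cudf code):

```cpp
// Made-up example: a zero grid dimension makes any launch fail with
// cudaErrorInvalidConfiguration, so return early when there is nothing to do.
#include <cstdio>
#include <cuda_runtime.h>

__global__ void noop_kernel() {}

void launch_if_nonempty(unsigned int num_str_cols)
{
  if (num_str_cols == 0) { return; }  // the guard: no work, so do not launch

  noop_kernel<<<dim3(num_str_cols), 256>>>();  // a grid dim of 0 here would be rejected
  std::printf("launch status: %s\n", cudaGetErrorString(cudaGetLastError()));
}
```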
Authors: - David Wendt (https://github.com/davidwendt) Approvers: - Nghia Truong (https://github.com/ttnghia) - Bradley Dice (https://github.com/bdice) URL: https://github.com/rapidsai/cudf/pull/14139 --- cpp/src/io/orc/dict_enc.cu | 1 + 1 file changed, 1 insertion(+) diff --git a/cpp/src/io/orc/dict_enc.cu b/cpp/src/io/orc/dict_enc.cu index 0007530a5af..1d2262a1ccc 100644 --- a/cpp/src/io/orc/dict_enc.cu +++ b/cpp/src/io/orc/dict_enc.cu @@ -60,6 +60,7 @@ void rowgroup_char_counts(device_2dspan counts, auto const num_rowgroups = rowgroup_bounds.size().first; auto const num_str_cols = str_col_indexes.size(); + if (num_str_cols == 0) { return; } int block_size = 0; // suggested thread count to use int min_grid_size = 0; // minimum block count required From ec744de69d88ada46d744c5121e137c817cb2709 Mon Sep 17 00:00:00 2001 From: MithunR Date: Thu, 21 Sep 2023 11:45:13 -0700 Subject: [PATCH 195/230] Support negative preceding/following for ROW window functions (#14093) This commit adds support for "offset" ROW windows, where the preceding and following window bounds are allowed to have negative values. This allows window definitions to exclude the current row entirely. Prior to this change, ROW-based windows *had* to include the current row, causing `preceding` and `following` to support only non-negative values. Additionally, the inclusion of the current row would count against the `min_periods` check. The following is an example of the new "negative" semantics. Consider the input: ```c++ auto const row = ints_column{1, 2, 3, 4}; ``` If the window bounds are specified as (preceding=3, following=-1), then the window for the third row (`3`) is `{1, 2}`. `following=-1` indicates a "following" row *before* the current row. A negative value for `preceding` follows the existing convention of including the current row. This makes it slightly more involved: 1. `preceding=2` indicates *one* row before the current row. 2. `preceding=1` indicates the current row. 3. `preceding=0` indicates one row past (i.e. after) the current row. 4. `preceding=-1` indicates two rows after the current row. Et cetera. `min_periods` checks continue to be honoured as before, but the requirement for positive `min_periods` is dropped. `min_periods` only need be non-negative. Authors: - MithunR (https://github.com/mythrocks) Approvers: - Divye Gala (https://github.com/divyegala) - Robert Maynard (https://github.com/robertmaynard) URL: https://github.com/rapidsai/cudf/pull/14093 --- cpp/include/cudf/rolling.hpp | 24 +- cpp/src/rolling/detail/rolling.cuh | 33 +- .../rolling/detail/rolling_fixed_window.cu | 30 +- cpp/src/rolling/grouped_rolling.cu | 188 +++++++--- cpp/src/rolling/rolling.cu | 4 +- cpp/tests/CMakeLists.txt | 1 + cpp/tests/rolling/grouped_rolling_test.cpp | 5 +- cpp/tests/rolling/offset_row_window_test.cpp | 343 ++++++++++++++++++ cpp/tests/rolling/rolling_test.cpp | 23 +- 9 files changed, 552 insertions(+), 99 deletions(-) create mode 100644 cpp/tests/rolling/offset_row_window_test.cpp diff --git a/cpp/include/cudf/rolling.hpp b/cpp/include/cudf/rolling.hpp index efdb85691bd..ec93c709163 100644 --- a/cpp/include/cudf/rolling.hpp +++ b/cpp/include/cudf/rolling.hpp @@ -199,10 +199,30 @@ struct window_bounds { * column of the same type as the input. Therefore it is suggested to convert integer column types * (especially low-precision integers) to `FLOAT32` or `FLOAT64` before doing a rolling `MEAN`. * + * Note: `preceding_window` and `following_window` could well have negative values. 
This yields + * windows where the current row might not be included at all. For instance, consider a window + * defined as (preceding=3, following=-1). This produces a window from 2 (i.e. 3-1) rows preceding + * the current row, and 1 row *preceding* the current row. For the example above, the window for + * row#3 is: + * + * [ 10, 20, 10, 50, 60, 20, 30, 80, 40 ] + * <--window--> ^ + * | + * current_row + * + * Similarly, `preceding` could have a negative value, indicating that the window begins at a + * position after the current row. It differs slightly from the semantics for `following`, because + * `preceding` includes the current row. Therefore: + * 1. preceding=1 => Window starts at the current row. + * 2. preceding=0 => Window starts at 1 past the current row. + * 3. preceding=-1 => Window starts at 2 past the current row. Etc. + * * @param[in] group_keys The (pre-sorted) grouping columns * @param[in] input The input column (to be aggregated) - * @param[in] preceding_window The static rolling window size in the backward direction - * @param[in] following_window The static rolling window size in the forward direction + * @param[in] preceding_window The static rolling window size in the backward direction (for + * positive values), or forward direction (for negative values) + * @param[in] following_window The static rolling window size in the forward direction (for positive + * values), or backward direction (for negative values) * @param[in] min_periods Minimum number of observations in window required to have a value, * otherwise element `i` is null. * @param[in] aggr The rolling window aggregation type (SUM, MAX, MIN, etc.) diff --git a/cpp/src/rolling/detail/rolling.cuh b/cpp/src/rolling/detail/rolling.cuh index 3b6d53f43c4..0648ef3d30f 100644 --- a/cpp/src/rolling/detail/rolling.cuh +++ b/cpp/src/rolling/detail/rolling.cuh @@ -70,7 +70,22 @@ namespace cudf { namespace detail { -namespace { // anonymous +/// Helper function to materialize preceding/following offsets. +template +std::unique_ptr expand_to_column(Calculator const& calc, + size_type const& num_rows, + rmm::cuda_stream_view stream) +{ + auto window_column = cudf::make_numeric_column( + cudf::data_type{type_to_id()}, num_rows, cudf::mask_state::UNALLOCATED, stream); + + auto begin = cudf::detail::make_counting_transform_iterator(0, calc); + + thrust::copy_n( + rmm::exec_policy(stream), begin, num_rows, window_column->mutable_view().data()); + + return window_column; +} /** * @brief Operator for applying a generic (non-specialized) rolling aggregation on a single window. 
@@ -91,14 +106,14 @@ struct DeviceRolling { // operations we do support template - DeviceRolling(size_type _min_periods, std::enable_if_t()>* = nullptr) + explicit DeviceRolling(size_type _min_periods, std::enable_if_t()>* = nullptr) : min_periods(_min_periods) { } // operations we don't support template - DeviceRolling(size_type _min_periods, std::enable_if_t()>* = nullptr) + explicit DeviceRolling(size_type _min_periods, std::enable_if_t()>* = nullptr) : min_periods(_min_periods) { CUDF_FAIL("Invalid aggregation/type pair"); @@ -111,7 +126,7 @@ struct DeviceRolling { mutable_column_device_view& output, size_type start_index, size_type end_index, - size_type current_index) + size_type current_index) const { using AggOp = typename corresponding_operator::type; AggOp agg_op; @@ -144,7 +159,7 @@ struct DeviceRolling { template struct DeviceRollingArgMinMaxBase { size_type min_periods; - DeviceRollingArgMinMaxBase(size_type _min_periods) : min_periods(_min_periods) {} + explicit DeviceRollingArgMinMaxBase(size_type _min_periods) : min_periods(_min_periods) {} static constexpr bool is_supported() { @@ -162,7 +177,7 @@ struct DeviceRollingArgMinMaxBase { */ template struct DeviceRollingArgMinMaxString : DeviceRollingArgMinMaxBase { - DeviceRollingArgMinMaxString(size_type _min_periods) + explicit DeviceRollingArgMinMaxString(size_type _min_periods) : DeviceRollingArgMinMaxBase(_min_periods) { } @@ -461,8 +476,8 @@ struct agg_specific_empty_output { } }; -std::unique_ptr empty_output_for_rolling_aggregation(column_view const& input, - rolling_aggregation const& agg) +static std::unique_ptr empty_output_for_rolling_aggregation(column_view const& input, + rolling_aggregation const& agg) { // TODO: // Ideally, for UDF aggregations, the returned column would match @@ -1215,8 +1230,6 @@ struct dispatch_rolling { } }; -} // namespace - // Applies a user-defined rolling window function to the values in a column. template std::unique_ptr rolling_window_udf(column_view const& input, diff --git a/cpp/src/rolling/detail/rolling_fixed_window.cu b/cpp/src/rolling/detail/rolling_fixed_window.cu index fb7b1b5f590..e951db955e5 100644 --- a/cpp/src/rolling/detail/rolling_fixed_window.cu +++ b/cpp/src/rolling/detail/rolling_fixed_window.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022, NVIDIA CORPORATION. + * Copyright (c) 2022-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -19,8 +19,9 @@ #include #include +#include + #include -#include namespace cudf::detail { @@ -43,6 +44,9 @@ std::unique_ptr rolling_window(column_view const& input, CUDF_EXPECTS((default_outputs.is_empty() || default_outputs.size() == input.size()), "Defaults column must be either empty or have as many rows as the input column."); + CUDF_EXPECTS(-(preceding_window - 1) <= following_window, + "Preceding window bounds must precede the following window bounds."); + if (agg.kind == aggregation::CUDA || agg.kind == aggregation::PTX) { // TODO: In future, might need to clamp preceding/following to column boundaries. return cudf::detail::rolling_window_udf(input, @@ -58,18 +62,22 @@ std::unique_ptr rolling_window(column_view const& input, // Clamp preceding/following to column boundaries. // E.g. 
If preceding_window == 2, then for a column of 5 elements, preceding_window will be: // [1, 2, 2, 2, 1] - auto const preceding_window_begin = cudf::detail::make_counting_transform_iterator( - 0, - [preceding_window] __device__(size_type i) { return thrust::min(i + 1, preceding_window); }); - auto const following_window_begin = cudf::detail::make_counting_transform_iterator( - 0, [col_size = input.size(), following_window] __device__(size_type i) { - return thrust::min(col_size - i - 1, following_window); - }); + auto const preceding_calc = [preceding_window] __device__(size_type i) { + return thrust::min(i + 1, preceding_window); + }; + + auto const following_calc = [col_size = input.size(), + following_window] __device__(size_type i) { + return thrust::min(col_size - i - 1, following_window); + }; + + auto const preceding_column = expand_to_column(preceding_calc, input.size(), stream); + auto const following_column = expand_to_column(following_calc, input.size(), stream); return cudf::detail::rolling_window(input, default_outputs, - preceding_window_begin, - following_window_begin, + preceding_column->view().begin(), + following_column->view().begin(), min_periods, agg, stream, diff --git a/cpp/src/rolling/grouped_rolling.cu b/cpp/src/rolling/grouped_rolling.cu index ca5c04d1c4f..6e69b5157c2 100644 --- a/cpp/src/rolling/grouped_rolling.cu +++ b/cpp/src/rolling/grouped_rolling.cu @@ -30,7 +30,6 @@ #include #include -#include #include #include #include @@ -94,6 +93,109 @@ std::unique_ptr grouped_rolling_window(table_view const& group_keys, namespace detail { +/// Preceding window calculation functor. +template +struct row_based_preceding_calc { + cudf::size_type const* _group_offsets_begin; + cudf::size_type const* _group_labels_begin; + cudf::size_type const _preceding_window; + + row_based_preceding_calc(rmm::device_uvector const& group_offsets, + rmm::device_uvector const& group_labels, + cudf::size_type const& preceding_window) + : _group_offsets_begin(group_offsets.data()), + _group_labels_begin(group_labels.data()), + _preceding_window(preceding_window) + { + } + + __device__ cudf::size_type operator()(cudf::size_type const& idx) const + { + auto group_label = _group_labels_begin[idx]; + if constexpr (preceding_less_than_1) { // where 1 indicates only the current row. + auto group_end = _group_offsets_begin[group_label + 1]; + return thrust::maximum{}(_preceding_window, -(group_end - 1 - idx)); + } else { + auto group_start = _group_offsets_begin[group_label]; + return thrust::minimum{}(_preceding_window, + idx - group_start + 1); // Preceding includes current row. + } + } +}; + +/// Helper to materialize preceding-window column, corrected to respect group boundaries. +/// E.g. If preceding window == 5, then, +/// 1. For the first row in the group, the preceding is set to 1, +/// 2. For the next row in the group, preceding is set to 2, etc. +std::unique_ptr make_preceding_column( + rmm::device_uvector const& group_offsets, + rmm::device_uvector const& group_labels, + cudf::size_type const& preceding_window, + cudf::size_type const& num_rows, + rmm::cuda_stream_view stream) +{ + if (preceding_window < 1) { + auto const calc = row_based_preceding_calc(group_offsets, group_labels, preceding_window); + return cudf::detail::expand_to_column(calc, num_rows, stream); + } else { + auto const calc = + row_based_preceding_calc(group_offsets, group_labels, preceding_window); + return cudf::detail::expand_to_column(calc, num_rows, stream); + } +} + +/// Following window calculation functor. 
+template +struct row_based_following_calc { + cudf::size_type const* _group_offsets_begin; + cudf::size_type const* _group_labels_begin; + cudf::size_type const _following_window; + + row_based_following_calc(rmm::device_uvector const& group_offsets, + rmm::device_uvector const& group_labels, + cudf::size_type const& following_window) + : _group_offsets_begin(group_offsets.data()), + _group_labels_begin(group_labels.data()), + _following_window(following_window) + { + } + + __device__ cudf::size_type operator()(cudf::size_type const& idx) const + { + auto group_label = _group_labels_begin[idx]; + if constexpr (following_less_than_0) { + auto group_start = _group_offsets_begin[group_label]; + return thrust::maximum{}(_following_window, -(idx - group_start) - 1); + } else { + auto group_end = + _group_offsets_begin[group_label + 1]; // Cannot fall off the end, since offsets + // is capped with `input.size()`. + return thrust::minimum{}(_following_window, (group_end - 1) - idx); + } + } +}; + +/// Helper to materialize following-window column, corrected to respect group boundaries. +/// i.e. If following window == 5, then: +/// 1. For the last row in the group, the following is set to 0. +/// 2. For the second last row in the group, following is set to 1, etc. +std::unique_ptr make_following_column( + rmm::device_uvector const& group_offsets, + rmm::device_uvector const& group_labels, + cudf::size_type const& following_window, + cudf::size_type const& num_rows, + rmm::cuda_stream_view stream) +{ + if (following_window < 0) { + auto const calc = row_based_following_calc(group_offsets, group_labels, following_window); + return cudf::detail::expand_to_column(calc, num_rows, stream); + } else { + auto const calc = + row_based_following_calc(group_offsets, group_labels, following_window); + return cudf::detail::expand_to_column(calc, num_rows, stream); + } +} + std::unique_ptr grouped_rolling_window(table_view const& group_keys, column_view const& input, column_view const& default_outputs, @@ -111,7 +213,7 @@ std::unique_ptr grouped_rolling_window(table_view const& group_keys, CUDF_EXPECTS((group_keys.num_columns() == 0 || group_keys.num_rows() == input.size()), "Size mismatch between group_keys and input vector."); - CUDF_EXPECTS((min_periods > 0), "min_periods must be positive"); + CUDF_EXPECTS((min_periods >= 0), "min_periods must be non-negative"); CUDF_EXPECTS((default_outputs.is_empty() || default_outputs.size() == input.size()), "Defaults column must be either empty or have as many rows as the input column."); @@ -127,6 +229,9 @@ std::unique_ptr grouped_rolling_window(table_view const& group_keys, auto const preceding_window = preceding_window_bounds.value(); auto const following_window = following_window_bounds.value(); + CUDF_EXPECTS(-(preceding_window - 1) <= following_window, + "Preceding window bounds must precede the following window bounds."); + if (group_keys.num_columns() == 0) { // No Groupby columns specified. Treat as one big group. 
return rolling_window( @@ -157,24 +262,6 @@ std::unique_ptr grouped_rolling_window(table_view const& group_keys, group_offsets.element(group_offsets.size() - 1, stream) == input.size() && "Must have at least one group."); - auto preceding_calculator = [d_group_offsets = group_offsets.data(), - d_group_labels = group_labels.data(), - preceding_window] __device__(size_type idx) { - auto group_label = d_group_labels[idx]; - auto group_start = d_group_offsets[group_label]; - return thrust::minimum{}(preceding_window, - idx - group_start + 1); // Preceding includes current row. - }; - - auto following_calculator = [d_group_offsets = group_offsets.data(), - d_group_labels = group_labels.data(), - following_window] __device__(size_type idx) { - auto group_label = d_group_labels[idx]; - auto group_end = d_group_offsets[group_label + 1]; // Cannot fall off the end, since offsets - // is capped with `input.size()`. - return thrust::minimum{}(following_window, (group_end - 1) - idx); - }; - if (aggr.kind == aggregation::CUDA || aggr.kind == aggregation::PTX) { cudf::detail::preceding_window_wrapper grouped_preceding_window{ group_offsets.data(), group_labels.data(), preceding_window}; @@ -192,15 +279,18 @@ std::unique_ptr grouped_rolling_window(table_view const& group_keys, stream, mr); } else { - return cudf::detail::rolling_window( - input, - default_outputs, - cudf::detail::make_counting_transform_iterator(0, preceding_calculator), - cudf::detail::make_counting_transform_iterator(0, following_calculator), - min_periods, - aggr, - stream, - mr); + auto const preceding_column = + make_preceding_column(group_offsets, group_labels, preceding_window, input.size(), stream); + auto const following_column = + make_following_column(group_offsets, group_labels, following_window, input.size(), stream); + return cudf::detail::rolling_window(input, + default_outputs, + preceding_column->view().begin(), + following_column->view().begin(), + min_periods, + aggr, + stream, + mr); } } @@ -321,22 +411,6 @@ std::tuple get_null_bounds_for_orderby_column( : std::make_tuple(num_rows - num_nulls, num_rows); } -template -std::unique_ptr expand_to_column(Calculator const& calc, - size_type const& num_rows, - rmm::cuda_stream_view stream) -{ - auto window_column = cudf::make_numeric_column( - cudf::data_type{type_to_id()}, num_rows, cudf::mask_state::UNALLOCATED, stream); - - auto begin = cudf::detail::make_counting_transform_iterator(0, calc); - - thrust::copy_n( - rmm::exec_policy(stream), begin, num_rows, window_column->mutable_view().data()); - - return window_column; -} - /// Range window computation, with /// 1. no grouping keys specified /// 2. rows in ASCENDING order. @@ -390,7 +464,8 @@ std::unique_ptr range_window_ASC(column_view const& input, 1; // Add 1, for `preceding` to account for current row. 
}; - auto const preceding_column = expand_to_column(preceding_calculator, input.size(), stream); + auto const preceding_column = + cudf::detail::expand_to_column(preceding_calculator, input.size(), stream); auto const following_calculator = [nulls_begin_idx = h_nulls_begin_idx, @@ -425,7 +500,8 @@ std::unique_ptr range_window_ASC(column_view const& input, 1; }; - auto const following_column = expand_to_column(following_calculator, input.size(), stream); + auto const following_column = + cudf::detail::expand_to_column(following_calculator, input.size(), stream); return cudf::detail::rolling_window( input, preceding_column->view(), following_column->view(), min_periods, aggr, stream, mr); @@ -570,7 +646,8 @@ std::unique_ptr range_window_ASC(column_view const& input, 1; // Add 1, for `preceding` to account for current row. }; - auto const preceding_column = expand_to_column(preceding_calculator, input.size(), stream); + auto const preceding_column = + cudf::detail::expand_to_column(preceding_calculator, input.size(), stream); auto const following_calculator = [d_group_offsets = group_offsets.data(), @@ -616,7 +693,8 @@ std::unique_ptr range_window_ASC(column_view const& input, 1; }; - auto const following_column = expand_to_column(following_calculator, input.size(), stream); + auto const following_column = + cudf::detail::expand_to_column(following_calculator, input.size(), stream); return cudf::detail::rolling_window( input, preceding_column->view(), following_column->view(), min_periods, aggr, stream, mr); @@ -675,7 +753,8 @@ std::unique_ptr range_window_DESC(column_view const& input, 1; // Add 1, for `preceding` to account for current row. }; - auto const preceding_column = expand_to_column(preceding_calculator, input.size(), stream); + auto const preceding_column = + cudf::detail::expand_to_column(preceding_calculator, input.size(), stream); auto const following_calculator = [nulls_begin_idx = h_nulls_begin_idx, @@ -710,7 +789,8 @@ std::unique_ptr range_window_DESC(column_view const& input, 1; }; - auto const following_column = expand_to_column(following_calculator, input.size(), stream); + auto const following_column = + cudf::detail::expand_to_column(following_calculator, input.size(), stream); return cudf::detail::rolling_window( input, preceding_column->view(), following_column->view(), min_periods, aggr, stream, mr); @@ -774,7 +854,8 @@ std::unique_ptr range_window_DESC(column_view const& input, 1; // Add 1, for `preceding` to account for current row. }; - auto const preceding_column = expand_to_column(preceding_calculator, input.size(), stream); + auto const preceding_column = + cudf::detail::expand_to_column(preceding_calculator, input.size(), stream); auto const following_calculator = [d_group_offsets = group_offsets.data(), @@ -817,7 +898,8 @@ std::unique_ptr range_window_DESC(column_view const& input, 1; }; - auto const following_column = expand_to_column(following_calculator, input.size(), stream); + auto const following_column = + cudf::detail::expand_to_column(following_calculator, input.size(), stream); if (aggr.kind == aggregation::CUDA || aggr.kind == aggregation::PTX) { CUDF_FAIL("Ranged rolling window does NOT (yet) support UDF."); diff --git a/cpp/src/rolling/rolling.cu b/cpp/src/rolling/rolling.cu index d699d7bea85..5c78cc4382d 100644 --- a/cpp/src/rolling/rolling.cu +++ b/cpp/src/rolling/rolling.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2022, NVIDIA CORPORATION. + * Copyright (c) 2019-2023, NVIDIA CORPORATION. 
* * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -20,8 +20,6 @@ #include #include -#include - namespace cudf { // Applies a fixed-size rolling window function to the values in a column, with default output diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt index 6414962903e..d1e50442058 100644 --- a/cpp/tests/CMakeLists.txt +++ b/cpp/tests/CMakeLists.txt @@ -453,6 +453,7 @@ ConfigureTest( rolling/grouped_rolling_test.cpp rolling/lead_lag_test.cpp rolling/nth_element_test.cpp + rolling/offset_row_window_test.cpp rolling/range_comparator_test.cu rolling/range_rolling_window_test.cpp rolling/range_window_bounds_test.cpp diff --git a/cpp/tests/rolling/grouped_rolling_test.cpp b/cpp/tests/rolling/grouped_rolling_test.cpp index 774f2f7fc40..7dd72ace53c 100644 --- a/cpp/tests/rolling/grouped_rolling_test.cpp +++ b/cpp/tests/rolling/grouped_rolling_test.cpp @@ -33,9 +33,6 @@ #include #include -#include -#include - const std::string cuda_func{ R"***( template @@ -637,7 +634,7 @@ TYPED_TEST(GroupedRollingTest, ZeroWindow) key_1_vec.end()); const cudf::table_view grouping_keys{std::vector{key_0, key_1}}; - cudf::size_type preceding_window = 0; + cudf::size_type preceding_window = 1; cudf::size_type following_window = 0; std::vector expected_group_offsets{0, 4, 8, DATA_SIZE}; diff --git a/cpp/tests/rolling/offset_row_window_test.cpp b/cpp/tests/rolling/offset_row_window_test.cpp new file mode 100644 index 00000000000..ec726878b34 --- /dev/null +++ b/cpp/tests/rolling/offset_row_window_test.cpp @@ -0,0 +1,343 @@ +/* + * Copyright (c) 2021-2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +template +using fwcw = cudf::test::fixed_width_column_wrapper; +template +using decimals_column = cudf::test::fixed_point_column_wrapper; +using ints_column = fwcw; +using bigints_column = fwcw; +using strings_column = cudf::test::strings_column_wrapper; +using lists_column = cudf::test::lists_column_wrapper; +using column_ptr = std::unique_ptr; +using cudf::test::iterators::all_nulls; +using cudf::test::iterators::no_nulls; +using cudf::test::iterators::nulls_at; + +auto constexpr null = int32_t{0}; // NULL representation for int32_t; + +struct OffsetRowWindowTest : public cudf::test::BaseFixture { + static ints_column const _keys; // {0, 0, 0, 0, 0, 0, 1, 1, 1, 1}; + static ints_column const _values; // {0, 1, 2, 3, 4, 5, 6, 7, 8, 9}; + + struct rolling_runner { + cudf::window_bounds _preceding, _following; + cudf::size_type _min_periods; + bool _grouped = true; + + rolling_runner(cudf::window_bounds const& preceding, + cudf::window_bounds const& following, + cudf::size_type min_periods_ = 1) + : _preceding{preceding}, _following{following}, _min_periods{min_periods_} + { + } + + rolling_runner& min_periods(cudf::size_type min_periods_) + { + _min_periods = min_periods_; + return *this; + } + + rolling_runner& grouped(bool grouped_) + { + _grouped = grouped_; + return *this; + } + + std::unique_ptr operator()(cudf::rolling_aggregation const& agg) const + { + auto const grouping_keys = + _grouped ? std::vector{_keys} : std::vector{}; + return cudf::grouped_rolling_window( + cudf::table_view{grouping_keys}, _values, _preceding, _following, _min_periods, agg); + } + }; +}; + +ints_column const OffsetRowWindowTest::_keys{0, 0, 0, 0, 0, 0, 1, 1, 1, 1}; +ints_column const OffsetRowWindowTest::_values{0, 1, 2, 3, 4, 5, 6, 7, 8, 9}; + +auto const AGG_COUNT_NON_NULL = + cudf::make_count_aggregation(cudf::null_policy::EXCLUDE); +auto const AGG_COUNT_ALL = + cudf::make_count_aggregation(cudf::null_policy::INCLUDE); +auto const AGG_MIN = cudf::make_min_aggregation(); +auto const AGG_MAX = cudf::make_max_aggregation(); +auto const AGG_SUM = cudf::make_sum_aggregation(); +auto const AGG_COLLECT_LIST = cudf::make_collect_list_aggregation(); + +TEST_F(OffsetRowWindowTest, OffsetRowWindow_Grouped_3_to_Minus_1) +{ + auto const preceding = cudf::window_bounds::get(3); + auto const following = cudf::window_bounds::get(-1); + auto run_rolling = rolling_runner{preceding, following}.min_periods(1).grouped(true); + + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*run_rolling(*AGG_COUNT_NON_NULL), + ints_column{{0, 1, 2, 2, 2, 2, 0, 1, 2, 2}, nulls_at({0, 6})}); + + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*run_rolling(*AGG_COUNT_ALL), + ints_column{{0, 1, 2, 2, 2, 2, 0, 1, 2, 2}, nulls_at({0, 6})}); + + CUDF_TEST_EXPECT_COLUMNS_EQUAL( + *run_rolling(*AGG_MIN), ints_column{{null, 0, 0, 1, 2, 3, null, 6, 6, 7}, nulls_at({0, 6})}); + + CUDF_TEST_EXPECT_COLUMNS_EQUAL( + *run_rolling(*AGG_MAX), ints_column{{null, 0, 1, 2, 3, 4, null, 6, 7, 8}, nulls_at({0, 6})}); + + CUDF_TEST_EXPECT_COLUMNS_EQUAL( + *run_rolling(*AGG_SUM), + bigints_column{{null, 0, 1, 3, 5, 7, null, 6, 13, 15}, nulls_at({0, 6})}); + + CUDF_TEST_EXPECT_COLUMNS_EQUAL( + *run_rolling(*AGG_COLLECT_LIST), + lists_column{{{}, {0}, {0, 1}, {1, 2}, {2, 3}, {3, 4}, {}, {6}, {6, 7}, {7, 8}}, + nulls_at({0, 6})}); + + run_rolling.min_periods(0); + + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*run_rolling(*AGG_COUNT_NON_NULL), + ints_column{{0, 1, 2, 2, 2, 2, 0, 1, 2, 2}, no_nulls()}); 
+ + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*run_rolling(*AGG_COUNT_ALL), + ints_column{{0, 1, 2, 2, 2, 2, 0, 1, 2, 2}, no_nulls()}); + + CUDF_TEST_EXPECT_COLUMNS_EQUAL( + *run_rolling(*AGG_COLLECT_LIST), + lists_column{{{}, {0}, {0, 1}, {1, 2}, {2, 3}, {3, 4}, {}, {6}, {6, 7}, {7, 8}}, no_nulls()}); +} + +TEST_F(OffsetRowWindowTest, OffsetRowWindow_Ungrouped_3_to_Minus_1) +{ + auto const preceding = cudf::window_bounds::get(3); + auto const following = cudf::window_bounds::get(-1); + auto run_rolling = rolling_runner{preceding, following}.min_periods(1).grouped(false); + + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*run_rolling(*AGG_COUNT_NON_NULL), + ints_column{{0, 1, 2, 2, 2, 2, 2, 2, 2, 2}, nulls_at({0})}); + + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*run_rolling(*AGG_COUNT_ALL), + ints_column{{0, 1, 2, 2, 2, 2, 2, 2, 2, 2}, nulls_at({0})}); + + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*run_rolling(*AGG_MIN), + ints_column{{null, 0, 0, 1, 2, 3, 4, 5, 6, 7}, nulls_at({0})}); + + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*run_rolling(*AGG_MAX), + ints_column{{null, 0, 1, 2, 3, 4, 5, 6, 7, 8}, nulls_at({0})}); + + CUDF_TEST_EXPECT_COLUMNS_EQUAL( + *run_rolling(*AGG_SUM), bigints_column{{null, 0, 1, 3, 5, 7, 9, 11, 13, 15}, nulls_at({0})}); + + CUDF_TEST_EXPECT_COLUMNS_EQUAL( + *run_rolling(*AGG_COLLECT_LIST), + lists_column{{{}, {0}, {0, 1}, {1, 2}, {2, 3}, {3, 4}, {4, 5}, {5, 6}, {6, 7}, {7, 8}}, + nulls_at({0})}); + + run_rolling.min_periods(0); + + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*run_rolling(*AGG_COUNT_NON_NULL), + ints_column{{0, 1, 2, 2, 2, 2, 2, 2, 2, 2}, no_nulls()}); + + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*run_rolling(*AGG_COUNT_ALL), + ints_column{{0, 1, 2, 2, 2, 2, 2, 2, 2, 2}, no_nulls()}); + + CUDF_TEST_EXPECT_COLUMNS_EQUAL( + *run_rolling(*AGG_COLLECT_LIST), + lists_column{{{}, {0}, {0, 1}, {1, 2}, {2, 3}, {3, 4}, {4, 5}, {5, 6}, {6, 7}, {7, 8}}, + no_nulls()}); +} + +TEST_F(OffsetRowWindowTest, OffsetRowWindow_Grouped_0_to_2) +{ + auto const preceding = cudf::window_bounds::get(0); + auto const following = cudf::window_bounds::get(2); + auto run_rolling = rolling_runner{preceding, following}.min_periods(1).grouped(true); + + CUDF_TEST_EXPECT_COLUMNS_EQUAL( + *run_rolling(*AGG_COUNT_NON_NULL), + ints_column{{2, 2, 2, 2, 1, null, 2, 2, 1, null}, nulls_at({5, 9})}); + + CUDF_TEST_EXPECT_COLUMNS_EQUAL( + *run_rolling(*AGG_COUNT_ALL), + ints_column{{2, 2, 2, 2, 1, null, 2, 2, 1, null}, nulls_at({5, 9})}); + + CUDF_TEST_EXPECT_COLUMNS_EQUAL( + *run_rolling(*AGG_MIN), ints_column{{1, 2, 3, 4, 5, null, 7, 8, 9, null}, nulls_at({5, 9})}); + + CUDF_TEST_EXPECT_COLUMNS_EQUAL( + *run_rolling(*AGG_MAX), ints_column{{2, 3, 4, 5, 5, null, 8, 9, 9, null}, nulls_at({5, 9})}); + + CUDF_TEST_EXPECT_COLUMNS_EQUAL( + *run_rolling(*AGG_SUM), + bigints_column{{3, 5, 7, 9, 5, null, 15, 17, 9, null}, nulls_at({5, 9})}); + + CUDF_TEST_EXPECT_COLUMNS_EQUAL( + *run_rolling(*AGG_COLLECT_LIST), + lists_column{{{1, 2}, {2, 3}, {3, 4}, {4, 5}, {5}, {}, {7, 8}, {8, 9}, {9}, {}}, + nulls_at({5, 9})}); + + run_rolling.min_periods(0); + + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*run_rolling(*AGG_COUNT_NON_NULL), + ints_column{{2, 2, 2, 2, 1, 0, 2, 2, 1, 0}, no_nulls()}); + + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*run_rolling(*AGG_COUNT_ALL), + ints_column{{2, 2, 2, 2, 1, 0, 2, 2, 1, 0}, no_nulls()}); + + CUDF_TEST_EXPECT_COLUMNS_EQUAL( + *run_rolling(*AGG_COLLECT_LIST), + lists_column{{{1, 2}, {2, 3}, {3, 4}, {4, 5}, {5}, {}, {7, 8}, {8, 9}, {9}, {}}, no_nulls}); +} + +TEST_F(OffsetRowWindowTest, OffsetRowWindow_Ungrouped_0_to_2) +{ + auto const preceding = 
cudf::window_bounds::get(0); + auto const following = cudf::window_bounds::get(2); + auto run_rolling = rolling_runner{preceding, following}.min_periods(1).grouped(false); + + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*run_rolling(*AGG_COUNT_NON_NULL), + ints_column{{2, 2, 2, 2, 2, 2, 2, 2, 1, null}, nulls_at({9})}); + + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*run_rolling(*AGG_COUNT_ALL), + ints_column{{2, 2, 2, 2, 2, 2, 2, 2, 1, null}, nulls_at({9})}); + + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*run_rolling(*AGG_MIN), + ints_column{{1, 2, 3, 4, 5, 6, 7, 8, 9, null}, nulls_at({9})}); + + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*run_rolling(*AGG_MAX), + ints_column{{2, 3, 4, 5, 6, 7, 8, 9, 9, null}, nulls_at({9})}); + + CUDF_TEST_EXPECT_COLUMNS_EQUAL( + *run_rolling(*AGG_SUM), bigints_column{{3, 5, 7, 9, 11, 13, 15, 17, 9, null}, nulls_at({9})}); + + CUDF_TEST_EXPECT_COLUMNS_EQUAL( + *run_rolling(*AGG_COLLECT_LIST), + lists_column{{{1, 2}, {2, 3}, {3, 4}, {4, 5}, {5, 6}, {6, 7}, {7, 8}, {8, 9}, {9}, {}}, + nulls_at({9})}); + + run_rolling.min_periods(0); + + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*run_rolling(*AGG_COUNT_NON_NULL), + ints_column{{2, 2, 2, 2, 2, 2, 2, 2, 1, 0}, no_nulls()}); + + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*run_rolling(*AGG_COUNT_ALL), + ints_column{{2, 2, 2, 2, 2, 2, 2, 2, 1, 0}, no_nulls()}); + + CUDF_TEST_EXPECT_COLUMNS_EQUAL( + *run_rolling(*AGG_COLLECT_LIST), + lists_column{{{1, 2}, {2, 3}, {3, 4}, {4, 5}, {5, 6}, {6, 7}, {7, 8}, {8, 9}, {9}, {}}, + no_nulls}); +} + +// To test that preceding bounds are clamped correctly at group boundaries. +TEST_F(OffsetRowWindowTest, TestNegativeBoundsClamp) +{ + auto const grp_iter = + thrust::make_transform_iterator(thrust::make_counting_iterator(0), [](auto const& i) { + return i / 10; // 0-9 in the first group, 10-19 in the second, etc. + }); + auto const agg_iter = thrust::make_constant_iterator(1); + + auto const grp = ints_column(grp_iter, grp_iter + 30); + auto const agg = ints_column(agg_iter, agg_iter + 30); + + auto const min_periods = 0; + auto const rolling_sum = [&](auto const preceding, auto const following) { + return cudf::grouped_rolling_window( + cudf::table_view{{grp}}, agg, preceding, following, min_periods, *AGG_SUM); + }; + + // Testing negative preceding. + for (auto const preceding : {0, -1, -2, -5, -10, -20, -50}) { + auto const results = rolling_sum(preceding, 100); + auto const expected_fun = [&](auto const& i) { + assert(preceding < 1); + auto const index_in_group = i % 10; + auto const start = std::min(-(preceding - 1) + index_in_group, 10); + return int64_t{10 - start}; + }; + auto const expected_iter = + thrust::make_transform_iterator(thrust::make_counting_iterator(0), expected_fun); + auto const expected = bigints_column(expected_iter, expected_iter + 30, no_nulls()); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); + } + + // Testing negative following. 
+ for (auto const following : {-1, -2, -5, -10, -20, -50}) { + auto const results = rolling_sum(100, following); + auto const expected_fun = [&](auto const& i) { + assert(following < 0); + auto const index_in_group = i % 10; + auto const end = std::max(index_in_group + following, -1); + return int64_t{end + 1}; + }; + auto const expected_iter = + thrust::make_transform_iterator(thrust::make_counting_iterator(0), expected_fun); + auto const expected = bigints_column(expected_iter, expected_iter + 30, no_nulls()); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); + } +} + +TEST_F(OffsetRowWindowTest, CheckGroupBoundaries) +{ + auto grp_iter = + thrust::make_transform_iterator(thrust::make_counting_iterator(0), [](auto const& i) { + if (i < 10) return 1; + if (i < 20) return 2; + return 3; + }); + auto const grp = ints_column(grp_iter, grp_iter + 30); + auto const agg = ints_column(grp_iter, grp_iter + 30); + { + auto const results = + cudf::grouped_rolling_window(cudf::table_view{{grp}}, + agg, + -80, + 100, + 1, + *cudf::make_max_aggregation()); + auto const null_iter = thrust::make_constant_iterator(null); + auto const expected = ints_column(null_iter, null_iter + 30, all_nulls()); + + CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view(), expected); + } + { + auto const results = + cudf::grouped_rolling_window(cudf::table_view{{grp}}, + agg, + -1, + 4, + 1, + *cudf::make_min_aggregation()); + auto const expected = + ints_column{{1, 1, 1, 1, 1, 1, 1, 1, null, null, 2, 2, 2, 2, 2, + 2, 2, 2, null, null, 3, 3, 3, 3, 3, 3, 3, 3, null, null}, + nulls_at({8, 9, 18, 19, 28, 29})}; + + CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view(), expected); + } +} diff --git a/cpp/tests/rolling/rolling_test.cpp b/cpp/tests/rolling/rolling_test.cpp index e410e2488b3..d0181974479 100644 --- a/cpp/tests/rolling/rolling_test.cpp +++ b/cpp/tests/rolling/rolling_test.cpp @@ -148,20 +148,6 @@ TEST_F(RollingStringTest, MinPeriods) CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected_count_all, got_count_all->view()); } -TEST_F(RollingStringTest, ZeroWindowSize) -{ - cudf::test::strings_column_wrapper input( - {"This", "is", "rolling", "test", "being", "operated", "on", "string", "column"}, - {1, 0, 0, 1, 0, 1, 1, 1, 0}); - cudf::test::fixed_width_column_wrapper expected_count( - {0, 0, 0, 0, 0, 0, 0, 0, 0}, {1, 1, 1, 1, 1, 1, 1, 1, 1, 1}); - - auto got_count = cudf::rolling_window( - input, 0, 0, 0, *cudf::make_count_aggregation()); - - CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected_count, got_count->view()); -} - // ========================================================================================= class RollingStructTest : public cudf::test::BaseFixture {}; @@ -970,6 +956,7 @@ TEST_F(RollingtVarStdTestUntyped, SimpleStaticVarianceStdInfNaN) #undef XXX } +/* // negative sizes TYPED_TEST(RollingTest, NegativeWindowSizes) { @@ -980,10 +967,12 @@ TYPED_TEST(RollingTest, NegativeWindowSizes) std::vector window{3}; std::vector negative_window{-2}; + this->run_test_col_agg(input, negative_window, window, 1); this->run_test_col_agg(input, window, negative_window, 1); this->run_test_col_agg(input, negative_window, negative_window, 1); } + */ // simple example from Pandas docs: TYPED_TEST(RollingTest, SimpleDynamic) @@ -1033,6 +1022,7 @@ TYPED_TEST(RollingTest, AllInvalid) } // window = following_window = 0 +// Note: Preceding includes current row, so its value is set to 1. 
TYPED_TEST(RollingTest, ZeroWindow) { cudf::size_type num_rows = 1000; @@ -1042,10 +1032,11 @@ TYPED_TEST(RollingTest, ZeroWindow) cudf::test::fixed_width_column_wrapper input( col_data.begin(), col_data.end(), col_mask.begin()); - std::vector window({0}); + std::vector preceding({0}); + std::vector following({1}); cudf::size_type periods = num_rows; - this->run_test_col_agg(input, window, window, periods); + this->run_test_col_agg(input, preceding, following, periods); } // min_periods = 0 From dcac6cc6a719e2caf1c461be32acd2f7e78308e2 Mon Sep 17 00:00:00 2001 From: Jake Awe <50372925+AyodeAwe@users.noreply.github.com> Date: Thu, 21 Sep 2023 14:04:09 -0500 Subject: [PATCH 196/230] Update image names (#14145) PR updates `rapidsai/ci` references to `rapidsai/ci-conda` Authors: - Jake Awe (https://github.com/AyodeAwe) Approvers: - AJ Schmidt (https://github.com/ajschmidt8) URL: https://github.com/rapidsai/cudf/pull/14145 --- .github/workflows/build.yaml | 2 +- .github/workflows/pr.yaml | 6 +++--- .github/workflows/test.yaml | 6 +++--- 3 files changed, 7 insertions(+), 7 deletions(-) diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml index 91ec0904103..0e120d34bb1 100644 --- a/.github/workflows/build.yaml +++ b/.github/workflows/build.yaml @@ -62,7 +62,7 @@ jobs: arch: "amd64" branch: ${{ inputs.branch }} build_type: ${{ inputs.build_type || 'branch' }} - container_image: "rapidsai/ci:latest" + container_image: "rapidsai/ci-conda:latest" date: ${{ inputs.date }} node_type: "gpu-v100-latest-1" run_script: "ci/build_docs.sh" diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml index b47a40b13d2..054ea7968c8 100644 --- a/.github/workflows/pr.yaml +++ b/.github/workflows/pr.yaml @@ -73,7 +73,7 @@ jobs: build_type: pull-request node_type: "gpu-v100-latest-1" arch: "amd64" - container_image: "rapidsai/ci:latest" + container_image: "rapidsai/ci-conda:latest" run_script: "ci/test_java.sh" conda-notebook-tests: needs: conda-python-build @@ -83,7 +83,7 @@ jobs: build_type: pull-request node_type: "gpu-v100-latest-1" arch: "amd64" - container_image: "rapidsai/ci:latest" + container_image: "rapidsai/ci-conda:latest" run_script: "ci/test_notebooks.sh" docs-build: needs: conda-python-build @@ -93,7 +93,7 @@ jobs: build_type: pull-request node_type: "gpu-v100-latest-1" arch: "amd64" - container_image: "rapidsai/ci:latest" + container_image: "rapidsai/ci-conda:latest" run_script: "ci/build_docs.sh" wheel-build-cudf: needs: checks diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml index 6bd2787d6dc..030f2e41db4 100644 --- a/.github/workflows/test.yaml +++ b/.github/workflows/test.yaml @@ -32,7 +32,7 @@ jobs: sha: ${{ inputs.sha }} node_type: "gpu-v100-latest-1" arch: "amd64" - container_image: "rapidsai/ci:latest" + container_image: "rapidsai/ci-conda:latest" run_script: "ci/test_cpp_memcheck.sh" conda-python-cudf-tests: secrets: inherit @@ -63,7 +63,7 @@ jobs: sha: ${{ inputs.sha }} node_type: "gpu-v100-latest-1" arch: "amd64" - container_image: "rapidsai/ci:latest" + container_image: "rapidsai/ci-conda:latest" run_script: "ci/test_java.sh" conda-notebook-tests: secrets: inherit @@ -75,7 +75,7 @@ jobs: sha: ${{ inputs.sha }} node_type: "gpu-v100-latest-1" arch: "amd64" - container_image: "rapidsai/ci:latest" + container_image: "rapidsai/ci-conda:latest" run_script: "ci/test_notebooks.sh" wheel-tests-cudf: secrets: inherit From f0ba8598dd9792e137ca7aa3a1b22dbb84393cc1 Mon Sep 17 00:00:00 2001 From: Bradley Dice Date: Thu, 21 Sep 2023 16:28:29 -0700 
Subject: [PATCH 197/230] Pin to numpy<1.25 and numba<0.58 to avoid errors and deprecation warnings-as-errors. (#14156) Closes #14155. Related: #14160. (Will newer numpy support be backported to pandas 1.x? edit: no, see below) Authors: - Bradley Dice (https://github.com/bdice) Approvers: - Vyas Ramasubramani (https://github.com/vyasr) - Benjamin Zaitlen (https://github.com/quasiben) - Ray Douglass (https://github.com/raydouglass) - GALI PREM SAGAR (https://github.com/galipremsagar) URL: https://github.com/rapidsai/cudf/pull/14156 --- conda/environments/all_cuda-118_arch-x86_64.yaml | 4 ++-- conda/environments/all_cuda-120_arch-x86_64.yaml | 4 ++-- conda/recipes/cudf/meta.yaml | 6 ++++-- dependencies.yaml | 8 +++++--- python/cudf/pyproject.toml | 6 +++--- python/cudf_kafka/pyproject.toml | 2 +- python/dask_cudf/pyproject.toml | 4 ++-- 7 files changed, 19 insertions(+), 15 deletions(-) diff --git a/conda/environments/all_cuda-118_arch-x86_64.yaml b/conda/environments/all_cuda-118_arch-x86_64.yaml index 692ba78f317..d4abc28cf13 100644 --- a/conda/environments/all_cuda-118_arch-x86_64.yaml +++ b/conda/environments/all_cuda-118_arch-x86_64.yaml @@ -55,8 +55,8 @@ dependencies: - nbsphinx - ninja - notebook -- numba>=0.57 -- numpy>=1.21 +- numba>=0.57,<0.58 +- numpy>=1.21,<1.25 - numpydoc - nvcc_linux-64=11.8 - nvcomp==2.6.1 diff --git a/conda/environments/all_cuda-120_arch-x86_64.yaml b/conda/environments/all_cuda-120_arch-x86_64.yaml index cf1bf4b8733..9a98e400e6d 100644 --- a/conda/environments/all_cuda-120_arch-x86_64.yaml +++ b/conda/environments/all_cuda-120_arch-x86_64.yaml @@ -54,8 +54,8 @@ dependencies: - nbsphinx - ninja - notebook -- numba>=0.57 -- numpy>=1.21 +- numba>=0.57,<0.58 +- numpy>=1.21,<1.25 - numpydoc - nvcomp==2.6.1 - nvtx>=0.2.1 diff --git a/conda/recipes/cudf/meta.yaml b/conda/recipes/cudf/meta.yaml index a909b72c878..54b687faa69 100644 --- a/conda/recipes/cudf/meta.yaml +++ b/conda/recipes/cudf/meta.yaml @@ -78,8 +78,10 @@ requirements: - typing_extensions >=4.0.0 - pandas >=1.3,<1.6.0dev0 - cupy >=12.0.0 - - numba >=0.57 - - numpy >=1.21 + # TODO: Pin to numba<0.58 until #14160 is resolved + - numba >=0.57,<0.58 + # TODO: Pin to numpy<1.25 until cudf requires pandas 2 + - numpy >=1.21,<1.25 - {{ pin_compatible('pyarrow', max_pin='x.x.x') }} - libcudf ={{ version }} - {{ pin_compatible('rmm', max_pin='x.x') }} diff --git a/dependencies.yaml b/dependencies.yaml index 398ae193fe6..376e43094a7 100644 --- a/dependencies.yaml +++ b/dependencies.yaml @@ -259,7 +259,8 @@ dependencies: # Hard pin the patch version used during the build. This must be kept # in sync with the version pinned in get_arrow.cmake. 
- pyarrow==12.0.1.* - - numpy>=1.21 + # TODO: Pin to numpy<1.25 until cudf requires pandas 2 + - &numpy numpy>=1.21,<1.25 build_python: common: - output_types: [conda, requirements, pyproject] @@ -425,14 +426,15 @@ dependencies: - output_types: [conda, requirements, pyproject] packages: - fsspec>=0.6.0 - - numpy>=1.21 + - *numpy - pandas>=1.3,<1.6.0dev0 run_cudf: common: - output_types: [conda, requirements, pyproject] packages: - cachetools - - &numba numba>=0.57 + # TODO: Pin to numba<0.58 until #14160 is resolved + - &numba numba>=0.57,<0.58 - nvtx>=0.2.1 - packaging - rmm==23.10.* diff --git a/python/cudf/pyproject.toml b/python/cudf/pyproject.toml index 574769f68d1..085d78afc7c 100644 --- a/python/cudf/pyproject.toml +++ b/python/cudf/pyproject.toml @@ -6,7 +6,7 @@ requires = [ "cmake>=3.26.4", "cython>=3.0.0", "ninja", - "numpy>=1.21", + "numpy>=1.21,<1.25", "protoc-wheel", "pyarrow==12.0.1.*", "rmm==23.10.*", @@ -31,8 +31,8 @@ dependencies = [ "cuda-python>=11.7.1,<12.0a0", "cupy-cuda11x>=12.0.0", "fsspec>=0.6.0", - "numba>=0.57", - "numpy>=1.21", + "numba>=0.57,<0.58", + "numpy>=1.21,<1.25", "nvtx>=0.2.1", "packaging", "pandas>=1.3,<1.6.0dev0", diff --git a/python/cudf_kafka/pyproject.toml b/python/cudf_kafka/pyproject.toml index a6ef867451b..386cdc32ab1 100644 --- a/python/cudf_kafka/pyproject.toml +++ b/python/cudf_kafka/pyproject.toml @@ -4,7 +4,7 @@ requires = [ "cython>=3.0.0", - "numpy>=1.21", + "numpy>=1.21,<1.25", "pyarrow==12.0.1.*", "setuptools", "wheel", diff --git a/python/dask_cudf/pyproject.toml b/python/dask_cudf/pyproject.toml index 2464abca71a..922da366422 100644 --- a/python/dask_cudf/pyproject.toml +++ b/python/dask_cudf/pyproject.toml @@ -23,7 +23,7 @@ dependencies = [ "dask>=2023.7.1", "distributed>=2023.7.1", "fsspec>=0.6.0", - "numpy>=1.21", + "numpy>=1.21,<1.25", "pandas>=1.3,<1.6.0dev0", ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. 
classifiers = [ @@ -40,7 +40,7 @@ dynamic = ["entry-points"] [project.optional-dependencies] test = [ "dask-cuda==23.10.*", - "numba>=0.57", + "numba>=0.57,<0.58", "pytest", "pytest-cov", "pytest-xdist", From dd58dc4e9dae387c878afbe6cb32a311ce76fe68 Mon Sep 17 00:00:00 2001 From: Ben Jarmak <104460670+jarmak-nv@users.noreply.github.com> Date: Fri, 22 Sep 2023 07:58:56 -0500 Subject: [PATCH 198/230] Remove outdated GitHub project actions (#14161) This PR removes two GitHub Actions that are no-longer needed: - `.github/workflows/add_to_project.yml` - This automatically adds issues and PRs to the cuDF/Dask/Numba/UCX project, but this is now a built-in functionality to projects - `.github/workflows/new-issues-to-triage-projects.yml` - This tries to add issues to a now closed project Authors: - Ben Jarmak (https://github.com/jarmak-nv) Approvers: - AJ Schmidt (https://github.com/ajschmidt8) --- .github/workflows/add_to_project.yml | 20 ----------- .../new-issues-to-triage-projects.yml | 35 ------------------- 2 files changed, 55 deletions(-) delete mode 100644 .github/workflows/add_to_project.yml delete mode 100644 .github/workflows/new-issues-to-triage-projects.yml diff --git a/.github/workflows/add_to_project.yml b/.github/workflows/add_to_project.yml deleted file mode 100644 index b301c56a999..00000000000 --- a/.github/workflows/add_to_project.yml +++ /dev/null @@ -1,20 +0,0 @@ -name: Add new issue/PR to project - -on: - issues: - types: - - opened - - pull_request_target: - types: - - opened - -jobs: - add-to-project: - name: Add issue or PR to project - runs-on: ubuntu-latest - steps: - - uses: actions/add-to-project@v0.3.0 - with: - project-url: https://github.com/orgs/rapidsai/projects/51 - github-token: ${{ secrets.ADD_TO_PROJECT_GITHUB_TOKEN }} diff --git a/.github/workflows/new-issues-to-triage-projects.yml b/.github/workflows/new-issues-to-triage-projects.yml deleted file mode 100644 index cf9b0c379f1..00000000000 --- a/.github/workflows/new-issues-to-triage-projects.yml +++ /dev/null @@ -1,35 +0,0 @@ -name: Auto Assign New Issues to Triage Project - -on: - issues: - types: [opened] - -env: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - -jobs: - assign_one_project: - runs-on: ubuntu-latest - name: Assign to New Issues to Triage Project - steps: - - name: Process bug issues - uses: docker://takanabe/github-actions-automate-projects:v0.0.1 - if: contains(github.event.issue.labels.*.name, 'bug') && contains(github.event.issue.labels.*.name, '? - Needs Triage') - env: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - GITHUB_PROJECT_URL: https://github.com/rapidsai/cudf/projects/1 - GITHUB_PROJECT_COLUMN_NAME: 'Needs prioritizing' - - name: Process feature issues - uses: docker://takanabe/github-actions-automate-projects:v0.0.1 - if: contains(github.event.issue.labels.*.name, 'feature request') && contains(github.event.issue.labels.*.name, '? - Needs Triage') - env: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - GITHUB_PROJECT_URL: https://github.com/rapidsai/cudf/projects/9 - GITHUB_PROJECT_COLUMN_NAME: 'Needs prioritizing' - - name: Process other issues - uses: docker://takanabe/github-actions-automate-projects:v0.0.1 - if: contains(github.event.issue.labels.*.name, '? 
- Needs Triage') && (!contains(github.event.issue.labels.*.name, 'bug') && !contains(github.event.issue.labels.*.name, 'feature request')) - env: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - GITHUB_PROJECT_URL: https://github.com/rapidsai/cudf/projects/10 - GITHUB_PROJECT_COLUMN_NAME: 'Needs prioritizing' From 98b1bc6c1ef1233a6c71c3b24fc8f88d591a4639 Mon Sep 17 00:00:00 2001 From: David Wendt <45795991+davidwendt@users.noreply.github.com> Date: Fri, 22 Sep 2023 11:07:37 -0400 Subject: [PATCH 199/230] Fix calls to copy_bitmask to pass stream parameter (#14158) Fixes a couple places where `cudf::copy_bitmask` was called instead of `cudf::detail::copy_bitmask` to pass the available stream (and mr) parameters. Found while reviewing #14121 Reference: https://github.com/rapidsai/cudf/pull/14121#discussion_r1332332391 Authors: - David Wendt (https://github.com/davidwendt) Approvers: - Bradley Dice (https://github.com/bdice) - MithunR (https://github.com/mythrocks) URL: https://github.com/rapidsai/cudf/pull/14158 --- cpp/src/lists/count_elements.cu | 12 ++++++------ cpp/src/replace/clamp.cu | 4 +++- 2 files changed, 9 insertions(+), 7 deletions(-) diff --git a/cpp/src/lists/count_elements.cu b/cpp/src/lists/count_elements.cu index f8e7b4c6126..40a14d805e1 100644 --- a/cpp/src/lists/count_elements.cu +++ b/cpp/src/lists/count_elements.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2022, NVIDIA CORPORATION. + * Copyright (c) 2021-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -36,12 +36,12 @@ namespace cudf { namespace lists { namespace detail { /** - * @brief Returns a numeric column containing lengths of each element. + * @brief Returns a numeric column containing lengths of each element * - * @param input Input lists column. - * @param stream CUDA stream used for device memory operations and kernel launches. + * @param input Input lists column + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned column's device memory - * @return New INT32 column with lengths. 
+ * @return New size_type column with lengths */ std::unique_ptr count_elements(lists_column_view const& input, rmm::cuda_stream_view stream, @@ -52,7 +52,7 @@ std::unique_ptr count_elements(lists_column_view const& input, // create output column auto output = make_fixed_width_column(data_type{type_to_id()}, input.size(), - copy_bitmask(input.parent()), + cudf::detail::copy_bitmask(input.parent(), stream, mr), input.null_count(), stream, mr); diff --git a/cpp/src/replace/clamp.cu b/cpp/src/replace/clamp.cu index 2b48aed2d29..950cb484ddf 100644 --- a/cpp/src/replace/clamp.cu +++ b/cpp/src/replace/clamp.cu @@ -163,7 +163,9 @@ std::enable_if_t(), std::unique_ptr> clamp auto output = detail::allocate_like(input, input.size(), mask_allocation_policy::NEVER, stream, mr); // mask will not change - if (input.nullable()) { output->set_null_mask(copy_bitmask(input), input.null_count()); } + if (input.nullable()) { + output->set_null_mask(cudf::detail::copy_bitmask(input, stream, mr), input.null_count()); + } auto output_device_view = cudf::mutable_column_device_view::create(output->mutable_view(), stream); From f865c871cd0f9b9c596476d9d98aafaf9cc46bb1 Mon Sep 17 00:00:00 2001 From: David Wendt <45795991+davidwendt@users.noreply.github.com> Date: Fri, 22 Sep 2023 11:08:11 -0400 Subject: [PATCH 200/230] Expose stream parameter in public nvtext ngram APIs (#14061) Add stream parameter to public APIs: - `nvtext::generate_ngrams()` - `nvtext::generate_character_ngrams()` - `nvtext::hash_character_ngrams()` - `nvtext::ngrams_tokenize()` Also cleaned up some of the doxygen comments. And also fixed a spelling mistake in the jaccard.cu source that was bothering me. Reference #13744 Authors: - David Wendt (https://github.com/davidwendt) Approvers: - Yunsong Wang (https://github.com/PointKernel) - Vyas Ramasubramani (https://github.com/vyasr) URL: https://github.com/rapidsai/cudf/pull/14061 --- cpp/benchmarks/text/ngrams.cpp | 3 +- cpp/benchmarks/text/tokenize.cpp | 7 ++- cpp/include/nvtext/generate_ngrams.hpp | 38 ++++++++------- cpp/include/nvtext/ngrams_tokenize.hpp | 28 +++++------ cpp/src/text/generate_ngrams.cu | 9 ++-- cpp/src/text/jaccard.cu | 4 +- cpp/src/text/ngrams_tokenize.cu | 4 +- cpp/tests/CMakeLists.txt | 1 + cpp/tests/streams/text/ngrams_test.cpp | 59 ++++++++++++++++++++++++ cpp/tests/text/ngrams_tests.cpp | 28 ++++++----- cpp/tests/text/ngrams_tokenize_tests.cpp | 11 +++-- 11 files changed, 135 insertions(+), 57 deletions(-) create mode 100644 cpp/tests/streams/text/ngrams_test.cpp diff --git a/cpp/benchmarks/text/ngrams.cpp b/cpp/benchmarks/text/ngrams.cpp index 0319577f6b9..f3fd5cc5729 100644 --- a/cpp/benchmarks/text/ngrams.cpp +++ b/cpp/benchmarks/text/ngrams.cpp @@ -36,11 +36,12 @@ static void BM_ngrams(benchmark::State& state, ngrams_type nt) cudf::type_id::STRING, distribution_id::NORMAL, 0, max_str_length); auto const column = create_random_column(cudf::type_id::STRING, row_count{n_rows}, profile); cudf::strings_column_view input(column->view()); + auto const separator = cudf::string_scalar("_"); for (auto _ : state) { cuda_event_timer raii(state, true); switch (nt) { - case ngrams_type::tokens: nvtext::generate_ngrams(input); break; + case ngrams_type::tokens: nvtext::generate_ngrams(input, 2, separator); break; case ngrams_type::characters: nvtext::generate_character_ngrams(input); break; } } diff --git a/cpp/benchmarks/text/tokenize.cpp b/cpp/benchmarks/text/tokenize.cpp index 423fe667b05..b556a84c541 100644 --- a/cpp/benchmarks/text/tokenize.cpp +++ 
b/cpp/benchmarks/text/tokenize.cpp @@ -67,8 +67,11 @@ static void bench_tokenize(nvbench::state& state) auto result = nvtext::count_tokens(input, cudf::strings_column_view(delimiters)); }); } else if (tokenize_type == "ngrams") { - state.exec(nvbench::exec_tag::sync, - [&](nvbench::launch& launch) { auto result = nvtext::ngrams_tokenize(input); }); + auto const delimiter = cudf::string_scalar(""); + auto const separator = cudf::string_scalar("_"); + state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { + auto result = nvtext::ngrams_tokenize(input, 2, delimiter, separator); + }); } else if (tokenize_type == "characters") { state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { auto result = nvtext::character_tokenize(input); }); diff --git a/cpp/include/nvtext/generate_ngrams.hpp b/cpp/include/nvtext/generate_ngrams.hpp index 5d66401df9d..46f2c0e7bc9 100644 --- a/cpp/include/nvtext/generate_ngrams.hpp +++ b/cpp/include/nvtext/generate_ngrams.hpp @@ -47,19 +47,19 @@ namespace nvtext { * @throw cudf::logic_error if `separator` is invalid * @throw cudf::logic_error if there are not enough strings to generate any ngrams * - * @param strings Strings column to tokenize and produce ngrams from. - * @param ngrams The ngram number to generate. - * Default is 2 = bigram. - * @param separator The string to use for separating ngram tokens. - * Default is "_" character. - * @param mr Device memory resource used to allocate the returned column's device memory. - * @return New strings columns of tokens. + * @param input Strings column to tokenize and produce ngrams from + * @param ngrams The ngram number to generate + * @param separator The string to use for separating ngram tokens + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate the returned column's device memory + * @return New strings columns of tokens */ std::unique_ptr generate_ngrams( - cudf::strings_column_view const& strings, - cudf::size_type ngrams = 2, - cudf::string_scalar const& separator = cudf::string_scalar{"_"}, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + cudf::strings_column_view const& input, + cudf::size_type ngrams, + cudf::string_scalar const& separator, + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** * @brief Generates ngrams of characters within each string. @@ -79,15 +79,17 @@ std::unique_ptr generate_ngrams( * @throw cudf::logic_error if `ngrams < 2` * @throw cudf::logic_error if there are not enough characters to generate any ngrams * - * @param strings Strings column to produce ngrams from. + * @param input Strings column to produce ngrams from * @param ngrams The ngram number to generate. * Default is 2 = bigram. - * @param mr Device memory resource used to allocate the returned column's device memory. - * @return New strings columns of tokens. 
+ * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate the returned column's device memory + * @return New strings columns of tokens */ std::unique_ptr generate_character_ngrams( - cudf::strings_column_view const& strings, + cudf::strings_column_view const& input, cudf::size_type ngrams = 2, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -113,14 +115,16 @@ std::unique_ptr generate_character_ngrams( * @throw cudf::logic_error if `ngrams < 2` * @throw cudf::logic_error if there are not enough characters to generate any ngrams * - * @param strings Strings column to produce ngrams from. + * @param input Strings column to produce ngrams from * @param ngrams The ngram number to generate. Default is 5. + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned column's device memory. * @return A lists column of hash values */ std::unique_ptr hash_character_ngrams( - cudf::strings_column_view const& strings, + cudf::strings_column_view const& input, cudf::size_type ngrams = 5, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @} */ // end of group diff --git a/cpp/include/nvtext/ngrams_tokenize.hpp b/cpp/include/nvtext/ngrams_tokenize.hpp index 17f20f7ea4c..9d76ef8689f 100644 --- a/cpp/include/nvtext/ngrams_tokenize.hpp +++ b/cpp/include/nvtext/ngrams_tokenize.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020, NVIDIA CORPORATION. + * Copyright (c) 2020-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -66,22 +66,22 @@ namespace nvtext { * * All null row entries are ignored and the output contains all valid rows. * - * @param strings Strings column to tokenize and produce ngrams from. - * @param ngrams The ngram number to generate. - * Default is 2 = bigram. + * @param input Strings column to tokenize and produce ngrams from + * @param ngrams The ngram number to generate * @param delimiter UTF-8 characters used to separate each string into tokens. - * The default of empty string will separate tokens using whitespace. - * @param separator The string to use for separating ngram tokens. - * Default is "_" character. - * @param mr Device memory resource used to allocate the returned column's device memory. - * @return New strings columns of tokens. + * An empty string will separate tokens using whitespace. 
+ * @param separator The string to use for separating ngram tokens + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate the returned column's device memory + * @return New strings columns of tokens */ std::unique_ptr ngrams_tokenize( - cudf::strings_column_view const& strings, - cudf::size_type ngrams = 2, - cudf::string_scalar const& delimiter = cudf::string_scalar{""}, - cudf::string_scalar const& separator = cudf::string_scalar{"_"}, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + cudf::strings_column_view const& input, + cudf::size_type ngrams, + cudf::string_scalar const& delimiter, + cudf::string_scalar const& separator, + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @} */ // end of group } // namespace nvtext diff --git a/cpp/src/text/generate_ngrams.cu b/cpp/src/text/generate_ngrams.cu index 938fd45246d..5f2f4d021a4 100644 --- a/cpp/src/text/generate_ngrams.cu +++ b/cpp/src/text/generate_ngrams.cu @@ -150,10 +150,11 @@ std::unique_ptr generate_ngrams(cudf::strings_column_view const& s std::unique_ptr generate_ngrams(cudf::strings_column_view const& strings, cudf::size_type ngrams, cudf::string_scalar const& separator, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::generate_ngrams(strings, ngrams, separator, cudf::get_default_stream(), mr); + return detail::generate_ngrams(strings, ngrams, separator, stream, mr); } namespace detail { @@ -317,18 +318,20 @@ std::unique_ptr hash_character_ngrams(cudf::strings_column_view co std::unique_ptr generate_character_ngrams(cudf::strings_column_view const& strings, cudf::size_type ngrams, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::generate_character_ngrams(strings, ngrams, cudf::get_default_stream(), mr); + return detail::generate_character_ngrams(strings, ngrams, stream, mr); } std::unique_ptr hash_character_ngrams(cudf::strings_column_view const& strings, cudf::size_type ngrams, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::hash_character_ngrams(strings, ngrams, cudf::get_default_stream(), mr); + return detail::hash_character_ngrams(strings, ngrams, stream, mr); } } // namespace nvtext diff --git a/cpp/src/text/jaccard.cu b/cpp/src/text/jaccard.cu index 5b55745c2c7..95324847ea0 100644 --- a/cpp/src/text/jaccard.cu +++ b/cpp/src/text/jaccard.cu @@ -107,7 +107,7 @@ rmm::device_uvector compute_unique_counts(cudf::column_view con * * This is called with a warp per row */ -struct sorted_interset_fn { +struct sorted_intersect_fn { cudf::column_device_view const d_input1; cudf::column_device_view const d_input2; cudf::size_type* d_results; @@ -151,7 +151,7 @@ rmm::device_uvector compute_intersect_counts(cudf::column_view auto const d_input1 = cudf::column_device_view::create(input1, stream); auto const d_input2 = cudf::column_device_view::create(input2, stream); auto d_results = rmm::device_uvector(input1.size(), stream); - sorted_interset_fn fn{*d_input1, *d_input2, d_results.data()}; + sorted_intersect_fn fn{*d_input1, *d_input2, d_results.data()}; thrust::for_each_n(rmm::exec_policy(stream), thrust::counting_iterator(0), input1.size() * cudf::detail::warp_size, diff --git a/cpp/src/text/ngrams_tokenize.cu b/cpp/src/text/ngrams_tokenize.cu index 
fd1cbf99221..73d85513e95 100644 --- a/cpp/src/text/ngrams_tokenize.cu +++ b/cpp/src/text/ngrams_tokenize.cu @@ -265,11 +265,11 @@ std::unique_ptr ngrams_tokenize(cudf::strings_column_view const& s cudf::size_type ngrams, cudf::string_scalar const& delimiter, cudf::string_scalar const& separator, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::ngrams_tokenize( - strings, ngrams, delimiter, separator, cudf::get_default_stream(), mr); + return detail::ngrams_tokenize(strings, ngrams, delimiter, separator, stream, mr); } } // namespace nvtext diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt index d1e50442058..ba4921848d7 100644 --- a/cpp/tests/CMakeLists.txt +++ b/cpp/tests/CMakeLists.txt @@ -632,6 +632,7 @@ ConfigureTest( STREAM_STRINGS_TEST streams/strings/case_test.cpp streams/strings/find_test.cpp STREAM_MODE testing ) +ConfigureTest(STREAM_TEXT_TEST streams/text/ngrams_test.cpp STREAM_MODE testing) # ################################################################################################## # Install tests #################################################################################### diff --git a/cpp/tests/streams/text/ngrams_test.cpp b/cpp/tests/streams/text/ngrams_test.cpp new file mode 100644 index 00000000000..bce0d2b680b --- /dev/null +++ b/cpp/tests/streams/text/ngrams_test.cpp @@ -0,0 +1,59 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include + +#include +#include +#include + +class TextNGramsTest : public cudf::test::BaseFixture {}; + +TEST_F(TextNGramsTest, GenerateNgrams) +{ + auto const input = + cudf::test::strings_column_wrapper({"the", "fox", "jumped", "over", "thé", "dog"}); + auto const separator = cudf::string_scalar{"_", true, cudf::test::get_default_stream()}; + nvtext::generate_ngrams( + cudf::strings_column_view(input), 3, separator, cudf::test::get_default_stream()); +} + +TEST_F(TextNGramsTest, GenerateCharacterNgrams) +{ + auto const input = + cudf::test::strings_column_wrapper({"the", "fox", "jumped", "over", "thé", "dog"}); + nvtext::generate_character_ngrams( + cudf::strings_column_view(input), 3, cudf::test::get_default_stream()); +} + +TEST_F(TextNGramsTest, HashCharacterNgrams) +{ + auto input = + cudf::test::strings_column_wrapper({"the quick brown fox", "jumped over the lazy dog."}); + nvtext::hash_character_ngrams( + cudf::strings_column_view(input), 5, cudf::test::get_default_stream()); +} + +TEST_F(TextNGramsTest, NgramsTokenize) +{ + auto input = + cudf::test::strings_column_wrapper({"the quick brown fox", "jumped over the lazy dog."}); + auto const delimiter = cudf::string_scalar{" ", true, cudf::test::get_default_stream()}; + auto const separator = cudf::string_scalar{"_", true, cudf::test::get_default_stream()}; + nvtext::ngrams_tokenize( + cudf::strings_column_view(input), 2, delimiter, separator, cudf::test::get_default_stream()); +} diff --git a/cpp/tests/text/ngrams_tests.cpp b/cpp/tests/text/ngrams_tests.cpp index 323b3eed3e2..7b179588385 100644 --- a/cpp/tests/text/ngrams_tests.cpp +++ b/cpp/tests/text/ngrams_tests.cpp @@ -34,18 +34,19 @@ TEST_F(TextGenerateNgramsTest, Ngrams) { cudf::test::strings_column_wrapper strings{"the", "fox", "jumped", "over", "thé", "dog"}; cudf::strings_column_view strings_view(strings); + auto const separator = cudf::string_scalar("_"); { cudf::test::strings_column_wrapper expected{ "the_fox", "fox_jumped", "jumped_over", "over_thé", "thé_dog"}; - auto const results = nvtext::generate_ngrams(strings_view); + auto const results = nvtext::generate_ngrams(strings_view, 2, separator); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); } { cudf::test::strings_column_wrapper expected{ "the_fox_jumped", "fox_jumped_over", "jumped_over_thé", "over_thé_dog"}; - auto const results = nvtext::generate_ngrams(strings_view, 3); + auto const results = nvtext::generate_ngrams(strings_view, 3, separator); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); } { @@ -83,10 +84,11 @@ TEST_F(TextGenerateNgramsTest, NgramsWithNulls) h_strings.begin(), h_strings.end(), thrust::make_transform_iterator(h_strings.begin(), [](auto str) { return str != nullptr; })); + auto const separator = cudf::string_scalar("_"); cudf::strings_column_view strings_view(strings); { - auto const results = nvtext::generate_ngrams(strings_view, 3); + auto const results = nvtext::generate_ngrams(strings_view, 3, separator); cudf::test::strings_column_wrapper expected{ "the_fox_jumped", "fox_jumped_over", "jumped_over_the", "over_the_dog"}; CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); @@ -103,7 +105,10 @@ TEST_F(TextGenerateNgramsTest, Empty) { auto const zero_size_strings_column = cudf::make_empty_column(cudf::type_id::STRING)->view(); - auto results = nvtext::generate_ngrams(cudf::strings_column_view(zero_size_strings_column)); + auto const separator = cudf::string_scalar("_"); + + auto results = + nvtext::generate_ngrams(cudf::strings_column_view(zero_size_strings_column), 
2, separator); cudf::test::expect_column_empty(results->view()); results = nvtext::generate_character_ngrams(cudf::strings_column_view(zero_size_strings_column)); cudf::test::expect_column_empty(results->view()); @@ -112,21 +117,20 @@ TEST_F(TextGenerateNgramsTest, Empty) TEST_F(TextGenerateNgramsTest, Errors) { cudf::test::strings_column_wrapper strings{""}; + auto const separator = cudf::string_scalar("_"); // invalid parameter value - EXPECT_THROW(nvtext::generate_ngrams(cudf::strings_column_view(strings), 1), cudf::logic_error); + EXPECT_THROW(nvtext::generate_ngrams(cudf::strings_column_view(strings), 1, separator), + cudf::logic_error); EXPECT_THROW(nvtext::generate_character_ngrams(cudf::strings_column_view(strings), 1), cudf::logic_error); // not enough strings to generate ngrams - EXPECT_THROW(nvtext::generate_ngrams(cudf::strings_column_view(strings), 3), cudf::logic_error); + EXPECT_THROW(nvtext::generate_ngrams(cudf::strings_column_view(strings), 3, separator), + cudf::logic_error); EXPECT_THROW(nvtext::generate_character_ngrams(cudf::strings_column_view(strings), 3), cudf::logic_error); - std::vector h_strings{"", nullptr, "", nullptr}; - cudf::test::strings_column_wrapper strings_no_tokens( - h_strings.begin(), - h_strings.end(), - thrust::make_transform_iterator(h_strings.begin(), [](auto str) { return str != nullptr; })); - EXPECT_THROW(nvtext::generate_ngrams(cudf::strings_column_view(strings_no_tokens)), + cudf::test::strings_column_wrapper strings_no_tokens({"", "", "", ""}, {1, 0, 1, 0}); + EXPECT_THROW(nvtext::generate_ngrams(cudf::strings_column_view(strings_no_tokens), 2, separator), cudf::logic_error); EXPECT_THROW(nvtext::generate_character_ngrams(cudf::strings_column_view(strings_no_tokens)), cudf::logic_error); diff --git a/cpp/tests/text/ngrams_tokenize_tests.cpp b/cpp/tests/text/ngrams_tokenize_tests.cpp index 5879bec3e64..c6fb886f7e5 100644 --- a/cpp/tests/text/ngrams_tokenize_tests.cpp +++ b/cpp/tests/text/ngrams_tokenize_tests.cpp @@ -62,7 +62,7 @@ TEST_F(TextNgramsTokenizeTest, Tokenize) "mousé_ate", "ate_the", "the_cheese"}; - auto results = nvtext::ngrams_tokenize(strings_view); + auto results = nvtext::ngrams_tokenize(strings_view, 2, std::string(), std::string("_")); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); } { @@ -101,9 +101,10 @@ TEST_F(TextNgramsTokenizeTest, TokenizeOneGram) { cudf::test::strings_column_wrapper strings{"aaa bbb", " ccc ddd ", "eee"}; cudf::strings_column_view strings_view(strings); + auto const empty = cudf::string_scalar(""); cudf::test::strings_column_wrapper expected{"aaa", "bbb", "ccc", "ddd", "eee"}; - auto results = nvtext::ngrams_tokenize(strings_view, 1); + auto results = nvtext::ngrams_tokenize(strings_view, 1, empty, empty); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); } @@ -111,7 +112,8 @@ TEST_F(TextNgramsTokenizeTest, TokenizeEmptyTest) { auto strings = cudf::make_empty_column(cudf::data_type{cudf::type_id::STRING}); cudf::strings_column_view strings_view(strings->view()); - auto results = nvtext::ngrams_tokenize(strings_view); + auto const empty = cudf::string_scalar(""); + auto results = nvtext::ngrams_tokenize(strings_view, 2, empty, empty); EXPECT_EQ(results->size(), 0); EXPECT_EQ(results->has_nulls(), false); } @@ -120,5 +122,6 @@ TEST_F(TextNgramsTokenizeTest, TokenizeErrorTest) { cudf::test::strings_column_wrapper strings{"this column intentionally left blank"}; cudf::strings_column_view strings_view(strings); - EXPECT_THROW(nvtext::ngrams_tokenize(strings_view, 0), cudf::logic_error); + auto 
const empty = cudf::string_scalar(""); + EXPECT_THROW(nvtext::ngrams_tokenize(strings_view, 0, empty, empty), cudf::logic_error); } From a6d014e632ecad86cef486402dbe53acee191a1d Mon Sep 17 00:00:00 2001 From: Lawrence Mitchell Date: Fri, 22 Sep 2023 16:24:33 +0100 Subject: [PATCH 201/230] Support callables in DataFrame.assign (#14142) While here, change the way the initial copied frame is constructed: callables are allowed to refer to columns already in the dataframe, even if they overwrite them. - Closes #12936 Authors: - Lawrence Mitchell (https://github.com/wence-) Approvers: - Matthew Roeschke (https://github.com/mroeschke) URL: https://github.com/rapidsai/cudf/pull/14142 --- python/cudf/cudf/core/dataframe.py | 23 ++++++++++++++--------- python/cudf/cudf/tests/test_dataframe.py | 19 +++++++++++++++++++ 2 files changed, 33 insertions(+), 9 deletions(-) diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index 1a780cc9e9f..8a3dbe77787 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -1390,10 +1390,21 @@ def _get_numeric_data(self): return self[columns] @_cudf_nvtx_annotate - def assign(self, **kwargs): + def assign(self, **kwargs: Union[Callable[[Self], Any], Any]): """ Assign columns to DataFrame from keyword arguments. + Parameters + ---------- + **kwargs: dict mapping string column names to values + The value for each key can either be a literal column (or + something that can be converted to a column), or + a callable of one argument that will be given the + dataframe as an argument and should return the new column + (without modifying the input argument). + Columns are added in-order, so callables can refer to + column names constructed in the assignment. + Examples -------- >>> import cudf @@ -1405,15 +1416,9 @@ def assign(self, **kwargs): 1 1 4 2 2 5 """ - new_df = cudf.DataFrame(index=self.index.copy()) - for name, col in self._data.items(): - if name in kwargs: - new_df[name] = kwargs.pop(name) - else: - new_df._data[name] = col.copy() - + new_df = self.copy(deep=False) for k, v in kwargs.items(): - new_df[k] = v + new_df[k] = v(new_df) if callable(v) else v return new_df @classmethod diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py index 6180162ecdd..2f531afdeb7 100644 --- a/python/cudf/cudf/tests/test_dataframe.py +++ b/python/cudf/cudf/tests/test_dataframe.py @@ -1327,6 +1327,25 @@ def test_assign(): np.testing.assert_equal(gdf2.y.to_numpy(), [2, 3, 4]) +@pytest.mark.parametrize( + "mapping", + [ + {"y": 1, "z": lambda df: df["x"] + df["y"]}, + { + "x": lambda df: df["x"] * 2, + "y": lambda df: 2, + "z": lambda df: df["x"] / df["y"], + }, + ], +) +def test_assign_callable(mapping): + df = pd.DataFrame({"x": [1, 2, 3]}) + cdf = cudf.from_pandas(df) + expect = df.assign(**mapping) + actual = cdf.assign(**mapping) + assert_eq(expect, actual) + + @pytest.mark.parametrize("nrows", [1, 8, 100, 1000]) @pytest.mark.parametrize("method", ["murmur3", "md5"]) @pytest.mark.parametrize("seed", [None, 42]) From 40bdd8ae4d89d2ea1f466c579d56f2c9ca1b014d Mon Sep 17 00:00:00 2001 From: Peter Andreas Entschev Date: Fri, 22 Sep 2023 19:20:18 +0200 Subject: [PATCH 202/230] Pin to `aws-sdk-cpp<1.11` (#14173) Pin conda packages to `aws-sdk-cpp<1.11`. 
The recent upgrade in version `1.11.*` has caused several issues with cleaning up (more details on changes can be read in [this link](https://github.com/aws/aws-sdk-cpp#version-111-is-now-available)), leading to Distributed and Dask-CUDA processes to segfault. The stack for one of those crashes looks like the following: ``` (gdb) bt #0 0x00007f5125359a0c in Aws::Utils::Logging::s_aws_logger_redirect_get_log_level(aws_logger*, unsigned int) () from /opt/conda/envs/dask/lib/python3.9/site-packages/pyarrow/../../.././libaws-cpp-sdk-core.so #1 0x00007f5124968f83 in aws_event_loop_thread () from /opt/conda/envs/dask/lib/python3.9/site-packages/pyarrow/../../../././libaws-c-io.so.1.0.0 #2 0x00007f5124ad9359 in thread_fn () from /opt/conda/envs/dask/lib/python3.9/site-packages/pyarrow/../../../././libaws-c-common.so.1 #3 0x00007f519958f6db in start_thread () from /lib/x86_64-linux-gnu/libpthread.so.0 #4 0x00007f5198b1361f in clone () from /lib/x86_64-linux-gnu/libc.so.6 ``` Such segfaults now manifest frequently in CI, and in some cases are reproducible with a hit rate of ~30%. Given the approaching release time, it's probably the safest option to just pin to an older version of the package while we don't pinpoint the exact cause for the issue and a patched build is released upstream. The `aws-sdk-cpp` is statically-linked in the `pyarrow` pip package, which prevents us from using the same pinning technique. cuDF is currently pinned to `pyarrow=12.0.1` which seems to be built against `aws-sdk-cpp=1.10.*`, as per [recent build logs](https://github.com/apache/arrow/actions/runs/6276453828/job/17046177335?pr=37792#step:6:1372). Authors: - Peter Andreas Entschev (https://github.com/pentschev) Approvers: - GALI PREM SAGAR (https://github.com/galipremsagar) - Ray Douglass (https://github.com/raydouglass) URL: https://github.com/rapidsai/cudf/pull/14173 --- conda/environments/all_cuda-118_arch-x86_64.yaml | 1 + conda/environments/all_cuda-120_arch-x86_64.yaml | 1 + conda/recipes/libcudf/conda_build_config.yaml | 3 +++ conda/recipes/libcudf/meta.yaml | 2 ++ dependencies.yaml | 1 + 5 files changed, 8 insertions(+) diff --git a/conda/environments/all_cuda-118_arch-x86_64.yaml b/conda/environments/all_cuda-118_arch-x86_64.yaml index d4abc28cf13..9fb991f9075 100644 --- a/conda/environments/all_cuda-118_arch-x86_64.yaml +++ b/conda/environments/all_cuda-118_arch-x86_64.yaml @@ -9,6 +9,7 @@ channels: - nvidia dependencies: - aiobotocore>=2.2.0 +- aws-sdk-cpp<1.11 - benchmark==1.8.0 - boto3>=1.21.21 - botocore>=1.24.21 diff --git a/conda/environments/all_cuda-120_arch-x86_64.yaml b/conda/environments/all_cuda-120_arch-x86_64.yaml index 9a98e400e6d..9ba0dd8dc38 100644 --- a/conda/environments/all_cuda-120_arch-x86_64.yaml +++ b/conda/environments/all_cuda-120_arch-x86_64.yaml @@ -9,6 +9,7 @@ channels: - nvidia dependencies: - aiobotocore>=2.2.0 +- aws-sdk-cpp<1.11 - benchmark==1.8.0 - boto3>=1.21.21 - botocore>=1.24.21 diff --git a/conda/recipes/libcudf/conda_build_config.yaml b/conda/recipes/libcudf/conda_build_config.yaml index 25b3f19de77..b1f5b083e06 100644 --- a/conda/recipes/libcudf/conda_build_config.yaml +++ b/conda/recipes/libcudf/conda_build_config.yaml @@ -22,6 +22,9 @@ gbench_version: gtest_version: - ">=1.13.0" +aws_sdk_cpp_version: + - "<1.11" + libarrow_version: - "=12" diff --git a/conda/recipes/libcudf/meta.yaml b/conda/recipes/libcudf/meta.yaml index 627065817ba..28357f0d96d 100644 --- a/conda/recipes/libcudf/meta.yaml +++ b/conda/recipes/libcudf/meta.yaml @@ -74,6 +74,7 @@ requirements: - 
gtest {{ gtest_version }} - gmock {{ gtest_version }} - zlib {{ zlib_version }} + - aws-sdk-cpp {{ aws_sdk_cpp_version }} outputs: - name: libcudf @@ -107,6 +108,7 @@ outputs: - dlpack {{ dlpack_version }} - gtest {{ gtest_version }} - gmock {{ gtest_version }} + - aws-sdk-cpp {{ aws_sdk_cpp_version }} test: commands: - test -f $PREFIX/lib/libcudf.so diff --git a/dependencies.yaml b/dependencies.yaml index 376e43094a7..5586f54348c 100644 --- a/dependencies.yaml +++ b/dependencies.yaml @@ -218,6 +218,7 @@ dependencies: - libkvikio==23.10.* - output_types: conda packages: + - aws-sdk-cpp<1.11 - fmt>=9.1.0,<10 - &gbench benchmark==1.8.0 - >est gtest>=1.13.0 From c7dd6b48684028a65b1d19d5d5b04060f6a4fe19 Mon Sep 17 00:00:00 2001 From: David Wendt <45795991+davidwendt@users.noreply.github.com> Date: Fri, 22 Sep 2023 14:15:31 -0400 Subject: [PATCH 203/230] Refactor libcudf indexalator to typed normalator (#14043) Creates generic normalizing-iterator for integer types for use by the `indexalator` and the future offsets normalizing iterator. Mostly code has been moved around or renamed so the normalizing-iterator part can take type template parameter to identify which integer type to normalize to. For the `indexalator`, this type is `cudf::size_type` and for the offsets iterator this type would be `int64`. Authors: - David Wendt (https://github.com/davidwendt) Approvers: - Bradley Dice (https://github.com/bdice) - MithunR (https://github.com/mythrocks) URL: https://github.com/rapidsai/cudf/pull/14043 --- cpp/include/cudf/detail/indexalator.cuh | 332 +--------------- .../cudf/detail/normalizing_iterator.cuh | 367 ++++++++++++++++++ 2 files changed, 374 insertions(+), 325 deletions(-) create mode 100644 cpp/include/cudf/detail/normalizing_iterator.cuh diff --git a/cpp/include/cudf/detail/indexalator.cuh b/cpp/include/cudf/detail/indexalator.cuh index 4731c4919e3..6532dae3695 100644 --- a/cpp/include/cudf/detail/indexalator.cuh +++ b/cpp/include/cudf/detail/indexalator.cuh @@ -16,14 +16,13 @@ #pragma once +#include + #include #include #include #include -#include -#include -#include #include #include #include @@ -32,193 +31,6 @@ namespace cudf { namespace detail { -/** - * @brief The base class for the input or output index normalizing iterator. - * - * This implementation uses CRTP to define the `input_indexalator` and the - * `output_indexalator` classes. This is so this class can manipulate the - * uniquely typed subclass member variable `p_` directly without requiring - * virtual functions since iterator instances will be copied to device memory. - * - * The base class mainly manages updating the `p_` member variable while the - * subclasses handle accessing individual elements in device memory. - * - * @tparam T The derived class type for the iterator. - */ -template -struct base_indexalator { - using difference_type = ptrdiff_t; - using value_type = size_type; - using pointer = size_type*; - using iterator_category = std::random_access_iterator_tag; - - base_indexalator() = default; - base_indexalator(base_indexalator const&) = default; - base_indexalator(base_indexalator&&) = default; - base_indexalator& operator=(base_indexalator const&) = default; - base_indexalator& operator=(base_indexalator&&) = default; - - /** - * @brief Prefix increment operator. - */ - CUDF_HOST_DEVICE inline T& operator++() - { - T& derived = static_cast(*this); - derived.p_ += width_; - return derived; - } - - /** - * @brief Postfix increment operator. 
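As a rough, editor-added illustration of the normalizing-iterator idea described in the patch above (#14043), and not part of the diff itself: the iterator reads integers of any stored width and always hands back one fixed integer type chosen by a template parameter (`cudf::size_type` for the indexalator, `int64` for a future offsets iterator). A minimal host-side sketch with hypothetical names (`read_normalized` is illustrative; the real `input_normalator` does this per element on the device):

```
// Host-side sketch only, not the libcudf implementation.
#include <cassert>
#include <cstdint>
#include <cstring>

template <typename Integer>
Integer read_normalized(void const* data, std::size_t idx, int width)
{
  // width is the byte width of the stored integer type: 1, 2, 4, or 8
  auto const* p = static_cast<char const*>(data) + idx * width;
  switch (width) {
    case 1: { std::int8_t v;  std::memcpy(&v, p, sizeof v); return static_cast<Integer>(v); }
    case 2: { std::int16_t v; std::memcpy(&v, p, sizeof v); return static_cast<Integer>(v); }
    case 4: { std::int32_t v; std::memcpy(&v, p, sizeof v); return static_cast<Integer>(v); }
    default: { std::int64_t v; std::memcpy(&v, p, sizeof v); return static_cast<Integer>(v); }
  }
}

int main()
{
  std::int16_t narrow[] = {10, 20, 30};
  // Normalize to a wider type without knowing the stored width at compile time;
  // only the template argument changes between an index and an offsets use case.
  assert(read_normalized<std::int64_t>(narrow, 2, sizeof(std::int16_t)) == 30);
  return 0;
}
```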
- */ - CUDF_HOST_DEVICE inline T operator++(int) - { - T tmp{static_cast(*this)}; - operator++(); - return tmp; - } - - /** - * @brief Prefix decrement operator. - */ - CUDF_HOST_DEVICE inline T& operator--() - { - T& derived = static_cast(*this); - derived.p_ -= width_; - return derived; - } - - /** - * @brief Postfix decrement operator. - */ - CUDF_HOST_DEVICE inline T operator--(int) - { - T tmp{static_cast(*this)}; - operator--(); - return tmp; - } - - /** - * @brief Compound assignment by sum operator. - */ - CUDF_HOST_DEVICE inline T& operator+=(difference_type offset) - { - T& derived = static_cast(*this); - derived.p_ += offset * width_; - return derived; - } - - /** - * @brief Increment by offset operator. - */ - CUDF_HOST_DEVICE inline T operator+(difference_type offset) const - { - auto tmp = T{static_cast(*this)}; - tmp.p_ += (offset * width_); - return tmp; - } - - /** - * @brief Addition assignment operator. - */ - CUDF_HOST_DEVICE inline friend T operator+(difference_type offset, T const& rhs) - { - T tmp{rhs}; - tmp.p_ += (offset * rhs.width_); - return tmp; - } - - /** - * @brief Compound assignment by difference operator. - */ - CUDF_HOST_DEVICE inline T& operator-=(difference_type offset) - { - T& derived = static_cast(*this); - derived.p_ -= offset * width_; - return derived; - } - - /** - * @brief Decrement by offset operator. - */ - CUDF_HOST_DEVICE inline T operator-(difference_type offset) const - { - auto tmp = T{static_cast(*this)}; - tmp.p_ -= (offset * width_); - return tmp; - } - - /** - * @brief Subtraction assignment operator. - */ - CUDF_HOST_DEVICE inline friend T operator-(difference_type offset, T const& rhs) - { - T tmp{rhs}; - tmp.p_ -= (offset * rhs.width_); - return tmp; - } - - /** - * @brief Compute offset from iterator difference operator. - */ - CUDF_HOST_DEVICE inline difference_type operator-(T const& rhs) const - { - return (static_cast(*this).p_ - rhs.p_) / width_; - } - - /** - * @brief Equals to operator. - */ - CUDF_HOST_DEVICE inline bool operator==(T const& rhs) const - { - return rhs.p_ == static_cast(*this).p_; - } - /** - * @brief Not equals to operator. - */ - CUDF_HOST_DEVICE inline bool operator!=(T const& rhs) const - { - return rhs.p_ != static_cast(*this).p_; - } - /** - * @brief Less than operator. - */ - CUDF_HOST_DEVICE inline bool operator<(T const& rhs) const - { - return static_cast(*this).p_ < rhs.p_; - } - /** - * @brief Greater than operator. - */ - CUDF_HOST_DEVICE inline bool operator>(T const& rhs) const - { - return static_cast(*this).p_ > rhs.p_; - } - /** - * @brief Less than or equals to operator. - */ - CUDF_HOST_DEVICE inline bool operator<=(T const& rhs) const - { - return static_cast(*this).p_ <= rhs.p_; - } - /** - * @brief Greater than or equals to operator. - */ - CUDF_HOST_DEVICE inline bool operator>=(T const& rhs) const - { - return static_cast(*this).p_ >= rhs.p_; - } - - protected: - /** - * @brief Constructor assigns width and type member variables for base class. - */ - base_indexalator(int32_t width, data_type dtype) : width_(width), dtype_(dtype) {} - - int width_; /// integer type width = 1,2,4, or 8 - data_type dtype_; /// for type-dispatcher calls -}; - /** * @brief The index normalizing input iterator. 
* @@ -244,65 +56,7 @@ struct base_indexalator { * auto result = thrust::find(thrust::device, begin, end, size_type{12} ); * @endcode */ -struct input_indexalator : base_indexalator { - friend struct indexalator_factory; - friend struct base_indexalator; // for CRTP - - using reference = size_type const; // this keeps STL and thrust happy - - input_indexalator() = default; - input_indexalator(input_indexalator const&) = default; - input_indexalator(input_indexalator&&) = default; - input_indexalator& operator=(input_indexalator const&) = default; - input_indexalator& operator=(input_indexalator&&) = default; - - /** - * @brief Indirection operator returns the value at the current iterator position. - */ - __device__ inline size_type operator*() const { return operator[](0); } - - /** - * @brief Dispatch functor for resolving a size_type value from any index type. - */ - struct index_as_size_type { - template ()>* = nullptr> - __device__ size_type operator()(void const* tp) - { - return static_cast(*static_cast(tp)); - } - template ()>* = nullptr> - __device__ size_type operator()(void const* tp) - { - CUDF_UNREACHABLE("only index types are supported"); - } - }; - /** - * @brief Array subscript operator returns a value at the input - * `idx` position as a `size_type` value. - */ - __device__ inline size_type operator[](size_type idx) const - { - void const* tp = p_ + (idx * width_); - return type_dispatcher(dtype_, index_as_size_type{}, tp); - } - - protected: - /** - * @brief Create an input index normalizing iterator. - * - * Use the indexalator_factory to create an iterator instance. - * - * @param data Pointer to an integer array in device memory. - * @param width The width of the integer type (1, 2, 4, or 8) - * @param data_type Index integer type of width `width` - */ - input_indexalator(void const* data, int width, data_type dtype) - : base_indexalator(width, dtype), p_{static_cast(data)} - { - } - - char const* p_; /// pointer to the integer data in device memory -}; +using input_indexalator = input_normalator; /** * @brief The index normalizing output iterator. @@ -328,79 +82,7 @@ struct input_indexalator : base_indexalator { * thrust::less()); * @endcode */ -struct output_indexalator : base_indexalator { - friend struct indexalator_factory; - friend struct base_indexalator; // for CRTP - - using reference = output_indexalator const&; // required for output iterators - - output_indexalator() = default; - output_indexalator(output_indexalator const&) = default; - output_indexalator(output_indexalator&&) = default; - output_indexalator& operator=(output_indexalator const&) = default; - output_indexalator& operator=(output_indexalator&&) = default; - - /** - * @brief Indirection operator returns this iterator instance in order - * to capture the `operator=(size_type)` calls. - */ - __device__ inline output_indexalator const& operator*() const { return *this; } - - /** - * @brief Array subscript operator returns an iterator instance at the specified `idx` position. - * - * This allows capturing the subsequent `operator=(size_type)` call in this class. - */ - __device__ inline output_indexalator const operator[](size_type idx) const - { - output_indexalator tmp{*this}; - tmp.p_ += (idx * width_); - return tmp; - } - - /** - * @brief Dispatch functor for setting the index value from a size_type value. 
- */ - struct size_type_to_index { - template ()>* = nullptr> - __device__ void operator()(void* tp, size_type const value) - { - (*static_cast(tp)) = static_cast(value); - } - template ()>* = nullptr> - __device__ void operator()(void* tp, size_type const value) - { - CUDF_UNREACHABLE("only index types are supported"); - } - }; - - /** - * @brief Assign a size_type value to the current iterator position. - */ - __device__ inline output_indexalator const& operator=(size_type const value) const - { - void* tp = p_; - type_dispatcher(dtype_, size_type_to_index{}, tp, value); - return *this; - } - - protected: - /** - * @brief Create an output index normalizing iterator. - * - * Use the indexalator_factory to create an iterator instance. - * - * @param data Pointer to an integer array in device memory. - * @param width The width of the integer type (1, 2, 4, or 8) - * @param data_type Index integer type of width `width` - */ - output_indexalator(void* data, int width, data_type dtype) - : base_indexalator(width, dtype), p_{static_cast(data)} - { - } - - char* p_; /// pointer to the integer data in device memory -}; +using output_indexalator = output_normalator; /** * @brief Use this class to create an indexalator instance. @@ -413,7 +95,7 @@ struct indexalator_factory { template ()>* = nullptr> input_indexalator operator()(column_view const& indices) { - return input_indexalator(indices.data(), sizeof(IndexType), indices.type()); + return input_indexalator(indices.data(), indices.type()); } template const&>(index) creates a copy auto const scalar_impl = static_cast const*>(&index); - return input_indexalator(scalar_impl->data(), sizeof(IndexType), index.type()); + return input_indexalator(scalar_impl->data(), index.type()); } template ()>* = nullptr> output_indexalator operator()(mutable_column_view const& indices) { - return output_indexalator(indices.data(), sizeof(IndexType), indices.type()); + return output_indexalator(indices.data(), indices.type()); } template + +#include + +namespace cudf { +namespace detail { + +/** + * @brief The base class for the input or output normalizing iterator + * + * The base class mainly manages updating the `p_` member variable while the + * subclasses handle accessing individual elements in device memory. + * + * @tparam Derived The derived class type for the iterator + * @tparam Integer The type the iterator normalizes to + */ +template +struct base_normalator { + static_assert(std::is_integral_v); + using difference_type = std::ptrdiff_t; + using value_type = Integer; + using pointer = Integer*; + using iterator_category = std::random_access_iterator_tag; + + base_normalator() = default; + base_normalator(base_normalator const&) = default; + base_normalator(base_normalator&&) = default; + base_normalator& operator=(base_normalator const&) = default; + base_normalator& operator=(base_normalator&&) = default; + + /** + * @brief Prefix increment operator. + */ + CUDF_HOST_DEVICE inline Derived& operator++() + { + Derived& derived = static_cast(*this); + derived.p_ += width_; + return derived; + } + + /** + * @brief Postfix increment operator. + */ + CUDF_HOST_DEVICE inline Derived operator++(int) + { + Derived tmp{static_cast(*this)}; + operator++(); + return tmp; + } + + /** + * @brief Prefix decrement operator. + */ + CUDF_HOST_DEVICE inline Derived& operator--() + { + Derived& derived = static_cast(*this); + derived.p_ -= width_; + return derived; + } + + /** + * @brief Postfix decrement operator. 
+ */ + CUDF_HOST_DEVICE inline Derived operator--(int) + { + Derived tmp{static_cast(*this)}; + operator--(); + return tmp; + } + + /** + * @brief Compound assignment by sum operator. + */ + CUDF_HOST_DEVICE inline Derived& operator+=(difference_type offset) + { + Derived& derived = static_cast(*this); + derived.p_ += offset * width_; + return derived; + } + + /** + * @brief Increment by offset operator. + */ + CUDF_HOST_DEVICE inline Derived operator+(difference_type offset) const + { + auto tmp = Derived{static_cast(*this)}; + tmp.p_ += (offset * width_); + return tmp; + } + + /** + * @brief Addition assignment operator. + */ + CUDF_HOST_DEVICE inline friend Derived operator+(difference_type offset, Derived const& rhs) + { + Derived tmp{rhs}; + tmp.p_ += (offset * rhs.width_); + return tmp; + } + + /** + * @brief Compound assignment by difference operator. + */ + CUDF_HOST_DEVICE inline Derived& operator-=(difference_type offset) + { + Derived& derived = static_cast(*this); + derived.p_ -= offset * width_; + return derived; + } + + /** + * @brief Decrement by offset operator. + */ + CUDF_HOST_DEVICE inline Derived operator-(difference_type offset) const + { + auto tmp = Derived{static_cast(*this)}; + tmp.p_ -= (offset * width_); + return tmp; + } + + /** + * @brief Subtraction assignment operator. + */ + CUDF_HOST_DEVICE inline friend Derived operator-(difference_type offset, Derived const& rhs) + { + Derived tmp{rhs}; + tmp.p_ -= (offset * rhs.width_); + return tmp; + } + + /** + * @brief Compute offset from iterator difference operator. + */ + CUDF_HOST_DEVICE inline difference_type operator-(Derived const& rhs) const + { + return (static_cast(*this).p_ - rhs.p_) / width_; + } + + /** + * @brief Equals to operator. + */ + CUDF_HOST_DEVICE inline bool operator==(Derived const& rhs) const + { + return rhs.p_ == static_cast(*this).p_; + } + + /** + * @brief Not equals to operator. + */ + CUDF_HOST_DEVICE inline bool operator!=(Derived const& rhs) const + { + return rhs.p_ != static_cast(*this).p_; + } + + /** + * @brief Less than operator. + */ + CUDF_HOST_DEVICE inline bool operator<(Derived const& rhs) const + { + return static_cast(*this).p_ < rhs.p_; + } + + /** + * @brief Greater than operator. + */ + CUDF_HOST_DEVICE inline bool operator>(Derived const& rhs) const + { + return static_cast(*this).p_ > rhs.p_; + } + + /** + * @brief Less than or equals to operator. + */ + CUDF_HOST_DEVICE inline bool operator<=(Derived const& rhs) const + { + return static_cast(*this).p_ <= rhs.p_; + } + + /** + * @brief Greater than or equals to operator. + */ + CUDF_HOST_DEVICE inline bool operator>=(Derived const& rhs) const + { + return static_cast(*this).p_ >= rhs.p_; + } + + protected: + /** + * @brief Constructor assigns width and type member variables for base class. + */ + explicit base_normalator(data_type dtype) : width_(size_of(dtype)), dtype_(dtype) {} + + int width_; /// integer type width = 1,2,4, or 8 + data_type dtype_; /// for type-dispatcher calls +}; + +/** + * @brief The integer normalizing input iterator + * + * This is an iterator that can be used for index types (integers) without + * requiring a type-specific instance. It can be used for any iterator + * interface for reading an array of integer values of type + * int8, int16, int32, int64, uint8, uint16, uint32, or uint64. 
+ * Reading specific elements always return a type of `Integer` + * + * @tparam Integer Type returned by all read functions + */ +template +struct input_normalator : base_normalator, Integer> { + friend struct base_normalator, Integer>; // for CRTP + + using reference = Integer const; // this keeps STL and thrust happy + + input_normalator() = default; + input_normalator(input_normalator const&) = default; + input_normalator(input_normalator&&) = default; + input_normalator& operator=(input_normalator const&) = default; + input_normalator& operator=(input_normalator&&) = default; + + /** + * @brief Indirection operator returns the value at the current iterator position + */ + __device__ inline Integer operator*() const { return operator[](0); } + + /** + * @brief Dispatch functor for resolving a Integer value from any integer type + */ + struct normalize_type { + template >* = nullptr> + __device__ Integer operator()(void const* tp) + { + return static_cast(*static_cast(tp)); + } + template >* = nullptr> + __device__ Integer operator()(void const*) + { + CUDF_UNREACHABLE("only integral types are supported"); + } + }; + + /** + * @brief Array subscript operator returns a value at the input + * `idx` position as a `Integer` value. + */ + __device__ inline Integer operator[](size_type idx) const + { + void const* tp = p_ + (idx * this->width_); + return type_dispatcher(this->dtype_, normalize_type{}, tp); + } + + /** + * @brief Create an input index normalizing iterator. + * + * Use the indexalator_factory to create an iterator instance. + * + * @param data Pointer to an integer array in device memory. + * @param data_type Type of data in data + */ + input_normalator(void const* data, data_type dtype) + : base_normalator, Integer>(dtype), p_{static_cast(data)} + { + } + + char const* p_; /// pointer to the integer data in device memory +}; + +/** + * @brief The integer normalizing output iterator + * + * This is an iterator that can be used for index types (integers) without + * requiring a type-specific instance. It can be used for any iterator + * interface for writing an array of integer values of type + * int8, int16, int32, int64, uint8, uint16, uint32, or uint64. + * Setting specific elements always accept the `Integer` type values. + * + * @tparam Integer The type used for all write functions + */ +template +struct output_normalator : base_normalator, Integer> { + friend struct base_normalator, Integer>; // for CRTP + + using reference = output_normalator const&; // required for output iterators + + output_normalator() = default; + output_normalator(output_normalator const&) = default; + output_normalator(output_normalator&&) = default; + output_normalator& operator=(output_normalator const&) = default; + output_normalator& operator=(output_normalator&&) = default; + + /** + * @brief Indirection operator returns this iterator instance in order + * to capture the `operator=(Integer)` calls. + */ + __device__ inline output_normalator const& operator*() const { return *this; } + + /** + * @brief Array subscript operator returns an iterator instance at the specified `idx` position. + * + * This allows capturing the subsequent `operator=(Integer)` call in this class. + */ + __device__ inline output_normalator const operator[](size_type idx) const + { + output_normalator tmp{*this}; + tmp.p_ += (idx * this->width_); + return tmp; + } + + /** + * @brief Dispatch functor for setting the index value from a size_type value. 
+ */ + struct normalize_type { + template >* = nullptr> + __device__ void operator()(void* tp, Integer const value) + { + (*static_cast(tp)) = static_cast(value); + } + template >* = nullptr> + __device__ void operator()(void*, Integer const) + { + CUDF_UNREACHABLE("only index types are supported"); + } + }; + + /** + * @brief Assign an Integer value to the current iterator position + */ + __device__ inline output_normalator const& operator=(Integer const value) const + { + void* tp = p_; + type_dispatcher(this->dtype_, normalize_type{}, tp, value); + return *this; + } + + /** + * @brief Create an output normalizing iterator + * + * @param data Pointer to an integer array in device memory. + * @param data_type Type of data in data + */ + output_normalator(void* data, data_type dtype) + : base_normalator, Integer>(dtype), p_{static_cast(data)} + { + } + + char* p_; /// pointer to the integer data in device memory +}; + +} // namespace detail +} // namespace cudf From 517d1239c913c86f7c1d9dc6642434e73aa2b14c Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Fri, 22 Sep 2023 12:40:09 -0700 Subject: [PATCH 204/230] Expose streams in all public sorting APIs (#14146) Contributes to #925 Authors: - Vyas Ramasubramani (https://github.com/vyasr) Approvers: - Bradley Dice (https://github.com/bdice) - David Wendt (https://github.com/davidwendt) URL: https://github.com/rapidsai/cudf/pull/14146 --- cpp/include/cudf/sorting.hpp | 44 ++++++--- cpp/src/lists/segmented_sort.cu | 30 +++--- cpp/src/sort/is_sorted.cu | 5 +- cpp/src/sort/rank.cu | 11 +-- cpp/src/sort/segmented_sort.cu | 8 +- cpp/src/sort/segmented_sort_impl.cuh | 2 +- cpp/src/sort/sort.cu | 10 +- cpp/src/sort/stable_segmented_sort.cu | 8 +- cpp/src/sort/stable_sort.cu | 8 +- cpp/tests/CMakeLists.txt | 7 +- cpp/tests/streams/sorting_test.cpp | 132 ++++++++++++++++++++++++++ 11 files changed, 210 insertions(+), 55 deletions(-) create mode 100644 cpp/tests/streams/sorting_test.cpp diff --git a/cpp/include/cudf/sorting.hpp b/cpp/include/cudf/sorting.hpp index 6924e77ae9b..e4e803b2d3c 100644 --- a/cpp/include/cudf/sorting.hpp +++ b/cpp/include/cudf/sorting.hpp @@ -18,6 +18,7 @@ #include #include +#include #include @@ -43,6 +44,7 @@ namespace cudf { * @param null_precedence The desired order of null compared to other elements * for each column. Size must be equal to `input.num_columns()` or empty. * If empty, all columns will be sorted in `null_order::BEFORE`. + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned column's device memory * @return A non-nullable column of elements containing the permuted row indices of * `input` if it were sorted @@ -51,6 +53,7 @@ std::unique_ptr sorted_order( table_view const& input, std::vector const& column_order = {}, std::vector const& null_precedence = {}, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -65,27 +68,30 @@ std::unique_ptr stable_sorted_order( table_view const& input, std::vector const& column_order = {}, std::vector const& null_precedence = {}, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** * @brief Checks whether the rows of a `table` are sorted in a lexicographical * order. 
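As a hedged usage sketch of the stream parameters this patch exposes (#14146), and not part of the diff itself: a caller can now run sorting work on a stream it owns rather than the default stream. The helper function name below is hypothetical; the `cudf::sort` signature follows the header change in this patch:

```
// Editor-added sketch: sort a table on a caller-owned CUDA stream using the
// stream-aware overload added in this patch. Assumes libcudf headers and a
// valid CUDA context.
#include <cudf/sorting.hpp>
#include <cudf/table/table.hpp>
#include <cudf/table/table_view.hpp>
#include <rmm/cuda_stream.hpp>

std::unique_ptr<cudf::table> sort_on_my_stream(cudf::table_view const& tbl)
{
  rmm::cuda_stream stream;  // caller-owned stream instead of cudf::get_default_stream()
  // Empty column_order/null_precedence keep the defaults (ascending, nulls before).
  auto sorted = cudf::sort(tbl, {}, {}, stream.view());
  stream.synchronize();  // make the result safe to consume on other streams
  return sorted;
}
```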
* - * @param[in] table Table whose rows need to be compared for ordering - * @param[in] column_order The expected sort order for each column. Size - * must be equal to `in.num_columns()` or empty. If - * empty, it is expected all columns are in - * ascending order. - * @param[in] null_precedence The desired order of null compared to other - * elements for each column. Size must be equal to - * `input.num_columns()` or empty. If empty, - * `null_order::BEFORE` is assumed for all columns. - * - * @returns bool true if sorted as expected, false if not + * @param table Table whose rows need to be compared for ordering + * @param column_order The expected sort order for each column. Size + * must be equal to `in.num_columns()` or empty. If + * empty, it is expected all columns are in + * ascending order. + * @param null_precedence The desired order of null compared to other + * elements for each column. Size must be equal to + * `input.num_columns()` or empty. If empty, + * `null_order::BEFORE` is assumed for all columns. + * + * @param stream CUDA stream used for device memory operations and kernel launches + * @returns true if sorted as expected, false if not */ bool is_sorted(cudf::table_view const& table, std::vector const& column_order, - std::vector const& null_precedence); + std::vector const& null_precedence, + rmm::cuda_stream_view stream = cudf::get_default_stream()); /** * @brief Performs a lexicographic sort of the rows of a table @@ -98,6 +104,7 @@ bool is_sorted(cudf::table_view const& table, * elements for each column in `input`. Size must be equal to * `input.num_columns()` or empty. If empty, all columns will be sorted with * `null_order::BEFORE`. + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned table's device memory * @return New table containing the desired sorted order of `input` */ @@ -105,6 +112,7 @@ std::unique_ptr
sort( table_view const& input, std::vector const& column_order = {}, std::vector const& null_precedence = {}, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -124,6 +132,7 @@ std::unique_ptr
sort( * elements for each column in `keys`. Size must be equal to * `keys.num_columns()` or empty. If empty, all columns will be sorted with * `null_order::BEFORE`. + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned table's device memory * @return The reordering of `values` determined by the lexicographic order of * the rows of `keys`. @@ -133,6 +142,7 @@ std::unique_ptr
sort_by_key( table_view const& keys, std::vector const& column_order = {}, std::vector const& null_precedence = {}, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -154,6 +164,7 @@ std::unique_ptr
sort_by_key( * elements for each column in `keys`. Size must be equal to * `keys.num_columns()` or empty. If empty, all columns will be sorted with * `null_order::BEFORE`. + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned table's device memory * @return The reordering of `values` determined by the lexicographic order of * the rows of `keys`. @@ -163,6 +174,7 @@ std::unique_ptr
stable_sort_by_key( table_view const& keys, std::vector const& column_order = {}, std::vector const& null_precedence = {}, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -189,6 +201,7 @@ std::unique_ptr
stable_sort_by_key( * @param null_precedence The desired order of null compared to other elements * for column * @param percentage flag to convert ranks to percentage in range (0,1] + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned column's device memory * @return A column of containing the rank of the each element of the column of `input`. The output * column type will be `size_type`column by default or else `double` when @@ -201,6 +214,7 @@ std::unique_ptr rank( null_policy null_handling, null_order null_precedence, bool percentage, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -241,6 +255,7 @@ std::unique_ptr rank( * elements for each column in `keys`. Size must be equal to * `keys.num_columns()` or empty. If empty, all columns will be sorted with * `null_order::BEFORE`. + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource to allocate any returned objects * @return sorted order of the segment sorted table * @@ -250,6 +265,7 @@ std::unique_ptr segmented_sorted_order( column_view const& segment_offsets, std::vector const& column_order = {}, std::vector const& null_precedence = {}, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -262,6 +278,7 @@ std::unique_ptr stable_segmented_sorted_order( column_view const& segment_offsets, std::vector const& column_order = {}, std::vector const& null_precedence = {}, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -306,6 +323,7 @@ std::unique_ptr stable_segmented_sorted_order( * elements for each column in `keys`. Size must be equal to * `keys.num_columns()` or empty. If empty, all columns will be sorted with * `null_order::BEFORE`. + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource to allocate any returned objects * @return table with elements in each segment sorted * @@ -316,6 +334,7 @@ std::unique_ptr
segmented_sort_by_key( column_view const& segment_offsets, std::vector const& column_order = {}, std::vector const& null_precedence = {}, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -329,6 +348,7 @@ std::unique_ptr
stable_segmented_sort_by_key( column_view const& segment_offsets, std::vector const& column_order = {}, std::vector const& null_precedence = {}, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @} */ // end of group diff --git a/cpp/src/lists/segmented_sort.cu b/cpp/src/lists/segmented_sort.cu index 260636a61cf..49054ebb046 100644 --- a/cpp/src/lists/segmented_sort.cu +++ b/cpp/src/lists/segmented_sort.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2022, NVIDIA CORPORATION. + * Copyright (c) 2021-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -70,13 +70,13 @@ std::unique_ptr sort_lists(lists_column_view const& input, auto output_offset = build_output_offsets(input, stream, mr); auto const child = input.get_sliced_child(stream); - auto const sorted_child_table = segmented_sort_by_key(table_view{{child}}, - table_view{{child}}, - output_offset->view(), - {column_order}, - {null_precedence}, - stream, - mr); + auto const sorted_child_table = cudf::detail::segmented_sort_by_key(table_view{{child}}, + table_view{{child}}, + output_offset->view(), + {column_order}, + {null_precedence}, + stream, + mr); return make_lists_column(input.size(), std::move(output_offset), @@ -98,13 +98,13 @@ std::unique_ptr stable_sort_lists(lists_column_view const& input, auto output_offset = build_output_offsets(input, stream, mr); auto const child = input.get_sliced_child(stream); - auto const sorted_child_table = stable_segmented_sort_by_key(table_view{{child}}, - table_view{{child}}, - output_offset->view(), - {column_order}, - {null_precedence}, - stream, - mr); + auto const sorted_child_table = cudf::detail::stable_segmented_sort_by_key(table_view{{child}}, + table_view{{child}}, + output_offset->view(), + {column_order}, + {null_precedence}, + stream, + mr); return make_lists_column(input.size(), std::move(output_offset), diff --git a/cpp/src/sort/is_sorted.cu b/cpp/src/sort/is_sorted.cu index 25c594e9e74..39476a2f534 100644 --- a/cpp/src/sort/is_sorted.cu +++ b/cpp/src/sort/is_sorted.cu @@ -73,7 +73,8 @@ bool is_sorted(cudf::table_view const& in, bool is_sorted(cudf::table_view const& in, std::vector const& column_order, - std::vector const& null_precedence) + std::vector const& null_precedence, + rmm::cuda_stream_view stream) { CUDF_FUNC_RANGE(); if (in.num_columns() == 0 || in.num_rows() == 0) { return true; } @@ -89,7 +90,7 @@ bool is_sorted(cudf::table_view const& in, "Number of columns in the table doesn't match the vector null_precedence's size .\n"); } - return detail::is_sorted(in, column_order, null_precedence, cudf::get_default_stream()); + return detail::is_sorted(in, column_order, null_precedence, stream); } } // namespace cudf diff --git a/cpp/src/sort/rank.cu b/cpp/src/sort/rank.cu index fd65e38d467..3ead8cfcbaa 100644 --- a/cpp/src/sort/rank.cu +++ b/cpp/src/sort/rank.cu @@ -366,16 +366,11 @@ std::unique_ptr rank(column_view const& input, null_policy null_handling, null_order null_precedence, bool percentage, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::rank(input, - method, - column_order, - null_handling, - null_precedence, - percentage, - cudf::get_default_stream(), - mr); + return detail::rank( + input, method, column_order, null_handling, null_precedence, percentage, stream, mr); } } // namespace cudf diff --git 
a/cpp/src/sort/segmented_sort.cu b/cpp/src/sort/segmented_sort.cu index 38d008c120c..d9457341bd2 100644 --- a/cpp/src/sort/segmented_sort.cu +++ b/cpp/src/sort/segmented_sort.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2022, NVIDIA CORPORATION. + * Copyright (c) 2021-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -81,11 +81,12 @@ std::unique_ptr segmented_sorted_order(table_view const& keys, column_view const& segment_offsets, std::vector const& column_order, std::vector const& null_precedence, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); return detail::segmented_sorted_order( - keys, segment_offsets, column_order, null_precedence, cudf::get_default_stream(), mr); + keys, segment_offsets, column_order, null_precedence, stream, mr); } std::unique_ptr
segmented_sort_by_key(table_view const& values, @@ -93,11 +94,12 @@ std::unique_ptr
segmented_sort_by_key(table_view const& values, column_view const& segment_offsets, std::vector const& column_order, std::vector const& null_precedence, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); return detail::segmented_sort_by_key( - values, keys, segment_offsets, column_order, null_precedence, cudf::get_default_stream(), mr); + values, keys, segment_offsets, column_order, null_precedence, stream, mr); } } // namespace cudf diff --git a/cpp/src/sort/segmented_sort_impl.cuh b/cpp/src/sort/segmented_sort_impl.cuh index 37664f33762..5d11bf055f1 100644 --- a/cpp/src/sort/segmented_sort_impl.cuh +++ b/cpp/src/sort/segmented_sort_impl.cuh @@ -166,7 +166,7 @@ std::unique_ptr fast_segmented_sorted_order(column_view const& input, // Unfortunately, CUB's segmented sort functions cannot accept iterators. // We have to build a pre-filled sequence of indices as input. auto sorted_indices = - cudf::detail::sequence(input.size(), numeric_scalar{0}, stream, mr); + cudf::detail::sequence(input.size(), numeric_scalar{0, true, stream}, stream, mr); auto indices_view = sorted_indices->mutable_view(); cudf::type_dispatcher(input.type(), diff --git a/cpp/src/sort/sort.cu b/cpp/src/sort/sort.cu index 25b95af4f83..46edae798d4 100644 --- a/cpp/src/sort/sort.cu +++ b/cpp/src/sort/sort.cu @@ -109,30 +109,32 @@ std::unique_ptr
sort(table_view const& input, std::unique_ptr sorted_order(table_view const& input, std::vector const& column_order, std::vector const& null_precedence, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::sorted_order(input, column_order, null_precedence, cudf::get_default_stream(), mr); + return detail::sorted_order(input, column_order, null_precedence, stream, mr); } std::unique_ptr
sort(table_view const& input, std::vector const& column_order, std::vector const& null_precedence, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::sort(input, column_order, null_precedence, cudf::get_default_stream(), mr); + return detail::sort(input, column_order, null_precedence, stream, mr); } std::unique_ptr
sort_by_key(table_view const& values, table_view const& keys, std::vector const& column_order, std::vector const& null_precedence, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::sort_by_key( - values, keys, column_order, null_precedence, cudf::get_default_stream(), mr); + return detail::sort_by_key(values, keys, column_order, null_precedence, stream, mr); } } // namespace cudf diff --git a/cpp/src/sort/stable_segmented_sort.cu b/cpp/src/sort/stable_segmented_sort.cu index 40df1b50279..4725d65e05d 100644 --- a/cpp/src/sort/stable_segmented_sort.cu +++ b/cpp/src/sort/stable_segmented_sort.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2022, NVIDIA CORPORATION. + * Copyright (c) 2021-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -55,11 +55,12 @@ std::unique_ptr stable_segmented_sorted_order( column_view const& segment_offsets, std::vector const& column_order, std::vector const& null_precedence, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); return detail::stable_segmented_sorted_order( - keys, segment_offsets, column_order, null_precedence, cudf::get_default_stream(), mr); + keys, segment_offsets, column_order, null_precedence, stream, mr); } std::unique_ptr
stable_segmented_sort_by_key(table_view const& values, @@ -67,11 +68,12 @@ std::unique_ptr
stable_segmented_sort_by_key(table_view const& values, column_view const& segment_offsets, std::vector const& column_order, std::vector const& null_precedence, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); return detail::stable_segmented_sort_by_key( - values, keys, segment_offsets, column_order, null_precedence, cudf::get_default_stream(), mr); + values, keys, segment_offsets, column_order, null_precedence, stream, mr); } } // namespace cudf diff --git a/cpp/src/sort/stable_sort.cu b/cpp/src/sort/stable_sort.cu index 6f5678c4168..cf602dcf1a9 100644 --- a/cpp/src/sort/stable_sort.cu +++ b/cpp/src/sort/stable_sort.cu @@ -62,22 +62,22 @@ std::unique_ptr
stable_sort_by_key(table_view const& values, std::unique_ptr stable_sorted_order(table_view const& input, std::vector const& column_order, std::vector const& null_precedence, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::stable_sorted_order( - input, column_order, null_precedence, cudf::get_default_stream(), mr); + return detail::stable_sorted_order(input, column_order, null_precedence, stream, mr); } std::unique_ptr
stable_sort_by_key(table_view const& values, table_view const& keys, std::vector const& column_order, std::vector const& null_precedence, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::stable_sort_by_key( - values, keys, column_order, null_precedence, cudf::get_default_stream(), mr); + return detail::stable_sort_by_key(values, keys, column_order, null_precedence, stream, mr); } } // namespace cudf diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt index ba4921848d7..c7d3e2af19f 100644 --- a/cpp/tests/CMakeLists.txt +++ b/cpp/tests/CMakeLists.txt @@ -621,17 +621,18 @@ ConfigureTest( STREAM_IDENTIFICATION_TEST identify_stream_usage/test_default_stream_identification.cu ) -ConfigureTest(STREAM_HASHING_TEST streams/hash_test.cpp STREAM_MODE testing) -ConfigureTest(STREAM_COPYING_TEST streams/copying_test.cpp STREAM_MODE testing) -ConfigureTest(STREAM_GROUPBY_TEST streams/groupby_test.cpp STREAM_MODE testing) ConfigureTest(STREAM_CONCATENATE_TEST streams/concatenate_test.cpp STREAM_MODE testing) +ConfigureTest(STREAM_COPYING_TEST streams/copying_test.cpp STREAM_MODE testing) ConfigureTest(STREAM_FILLING_TEST streams/filling_test.cpp STREAM_MODE testing) +ConfigureTest(STREAM_GROUPBY_TEST streams/groupby_test.cpp STREAM_MODE testing) +ConfigureTest(STREAM_HASHING_TEST streams/hash_test.cpp STREAM_MODE testing) ConfigureTest(STREAM_REPLACE_TEST streams/replace_test.cpp STREAM_MODE testing) ConfigureTest(STREAM_SEARCH_TEST streams/search_test.cpp STREAM_MODE testing) ConfigureTest( STREAM_STRINGS_TEST streams/strings/case_test.cpp streams/strings/find_test.cpp STREAM_MODE testing ) +ConfigureTest(STREAM_SORTING_TEST streams/sorting_test.cpp STREAM_MODE testing) ConfigureTest(STREAM_TEXT_TEST streams/text/ngrams_test.cpp STREAM_MODE testing) # ################################################################################################## diff --git a/cpp/tests/streams/sorting_test.cpp b/cpp/tests/streams/sorting_test.cpp new file mode 100644 index 00000000000..e481f95bded --- /dev/null +++ b/cpp/tests/streams/sorting_test.cpp @@ -0,0 +1,132 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include + +#include +#include +#include + +class SortingTest : public cudf::test::BaseFixture {}; + +TEST_F(SortingTest, SortedOrder) +{ + cudf::test::fixed_width_column_wrapper const column{10, 20, 30, 40, 50}; + cudf::table_view const tbl{{column}}; + + cudf::sorted_order(tbl, {}, {}, cudf::test::get_default_stream()); +} + +TEST_F(SortingTest, StableSortedOrder) +{ + cudf::test::fixed_width_column_wrapper const column{10, 20, 30, 40, 50}; + cudf::table_view const tbl{{column}}; + + cudf::stable_sorted_order(tbl, {}, {}, cudf::test::get_default_stream()); +} + +TEST_F(SortingTest, IsSorted) +{ + cudf::test::fixed_width_column_wrapper const column{10, 20, 30, 40, 50}; + cudf::table_view const tbl{{column}}; + + cudf::is_sorted(tbl, {}, {}, cudf::test::get_default_stream()); +} + +TEST_F(SortingTest, Sort) +{ + cudf::test::fixed_width_column_wrapper const column{10, 20, 30, 40, 50}; + cudf::table_view const tbl{{column}}; + + cudf::sort(tbl, {}, {}, cudf::test::get_default_stream()); +} + +TEST_F(SortingTest, SortByKey) +{ + cudf::test::fixed_width_column_wrapper const values_col{10, 20, 30, 40, 50}; + cudf::table_view const values{{values_col}}; + cudf::test::fixed_width_column_wrapper const keys_col{10, 20, 30, 40, 50}; + cudf::table_view const keys{{keys_col}}; + + cudf::sort_by_key(values, keys, {}, {}, cudf::test::get_default_stream()); +} + +TEST_F(SortingTest, StableSortByKey) +{ + cudf::test::fixed_width_column_wrapper const values_col{10, 20, 30, 40, 50}; + cudf::table_view const values{{values_col}}; + cudf::test::fixed_width_column_wrapper const keys_col{10, 20, 30, 40, 50}; + cudf::table_view const keys{{keys_col}}; + + cudf::stable_sort_by_key(values, keys, {}, {}, cudf::test::get_default_stream()); +} + +TEST_F(SortingTest, Rank) +{ + cudf::test::fixed_width_column_wrapper const column{10, 20, 30, 40, 50}; + + cudf::rank(column, + cudf::rank_method::AVERAGE, + cudf::order::ASCENDING, + cudf::null_policy::EXCLUDE, + cudf::null_order::AFTER, + false, + cudf::test::get_default_stream()); +} + +TEST_F(SortingTest, SegmentedSortedOrder) +{ + cudf::test::fixed_width_column_wrapper const keys_col{9, 8, 7, 6, 5, 4, 3, 2, 1, 0}; + cudf::table_view const keys{{keys_col}}; + cudf::test::fixed_width_column_wrapper const segment_offsets{3, 7}; + + cudf::segmented_sorted_order(keys, segment_offsets, {}, {}, cudf::test::get_default_stream()); +} + +TEST_F(SortingTest, StableSegmentedSortedOrder) +{ + cudf::test::fixed_width_column_wrapper const keys_col{9, 8, 7, 6, 5, 4, 3, 2, 1, 0}; + cudf::table_view const keys{{keys_col}}; + cudf::test::fixed_width_column_wrapper const segment_offsets{3, 7}; + + cudf::stable_segmented_sorted_order( + keys, segment_offsets, {}, {}, cudf::test::get_default_stream()); +} + +TEST_F(SortingTest, SegmentedSortByKey) +{ + cudf::test::fixed_width_column_wrapper const keys_col{9, 8, 7, 6, 5, 4, 3, 2, 1, 0}; + cudf::table_view const keys{{keys_col}}; + cudf::test::fixed_width_column_wrapper const values_col{7, 6, 9, 3, 4, 5, 1, 2, 0, 4}; + cudf::table_view const values{{values_col}}; + cudf::test::fixed_width_column_wrapper const segment_offsets{0, 3, 7, 10}; + + cudf::segmented_sort_by_key( + values, keys, segment_offsets, {}, {}, cudf::test::get_default_stream()); +} + +TEST_F(SortingTest, StableSegmentedSortByKey) +{ + cudf::test::fixed_width_column_wrapper const keys_col{9, 8, 7, 6, 5, 4, 3, 2, 1, 0}; + cudf::table_view const keys{{keys_col}}; + cudf::test::fixed_width_column_wrapper const values_col{7, 6, 9, 3, 4, 5, 1, 2, 0, 4}; + 
cudf::table_view const values{{values_col}}; + cudf::test::fixed_width_column_wrapper const segment_offsets{0, 3, 7, 10}; + + cudf::stable_segmented_sort_by_key( + values, keys, segment_offsets, {}, {}, cudf::test::get_default_stream()); +} From 71f30bec80194e8711156cea90d09b4ee0c940bd Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Fri, 22 Sep 2023 17:59:25 -0700 Subject: [PATCH 205/230] Enable direct ingestion and production of Arrow scalars (#14121) This PR adds overloads of `from_arrow` and `to_arrow` for scalars to enable interoperability on par with Arrow Arrays. The new public APIs accept streams, and for consistency streams have also been added to the corresponding column APIs, so this PR contributes to #925. Authors: - Vyas Ramasubramani (https://github.com/vyasr) Approvers: - David Wendt (https://github.com/davidwendt) - Bradley Dice (https://github.com/bdice) URL: https://github.com/rapidsai/cudf/pull/14121 --- cpp/include/cudf/detail/interop.hpp | 80 ++++++++++++++++++-- cpp/include/cudf/interop.hpp | 35 ++++++++- cpp/src/interop/from_arrow.cu | 88 +++++++++++++++++++++- cpp/src/interop/to_arrow.cu | 99 +++++++++++++++++++------ cpp/tests/CMakeLists.txt | 1 + cpp/tests/interop/from_arrow_test.cpp | 95 ++++++++++++++++++++++++ cpp/tests/interop/to_arrow_test.cpp | 103 ++++++++++++++++++++++++++ cpp/tests/streams/interop_test.cpp | 68 +++++++++++++++++ 8 files changed, 540 insertions(+), 29 deletions(-) create mode 100644 cpp/tests/streams/interop_test.cpp diff --git a/cpp/include/cudf/detail/interop.hpp b/cpp/include/cudf/detail/interop.hpp index 3d4832c8d17..44024333239 100644 --- a/cpp/include/cudf/detail/interop.hpp +++ b/cpp/include/cudf/detail/interop.hpp @@ -104,13 +104,67 @@ std::shared_ptr to_arrow_array(cudf::type_id id, Ts&&... args) } } +/** + * @brief Invokes an `operator()` template with the type instantiation based on + * the specified `arrow::DataType`'s `id()`. + * + * This function is analogous to libcudf's type_dispatcher, but instead applies + * to Arrow functions. Its primary use case is to leverage Arrow's + * metaprogramming facilities like arrow::TypeTraits that require translating + * the runtime dtype information into compile-time types. + */ +template +constexpr decltype(auto) arrow_type_dispatcher(arrow::DataType const& dtype, + Functor f, + Ts&&... 
args) +{ + switch (dtype.id()) { + case arrow::Type::INT8: + return f.template operator()(std::forward(args)...); + case arrow::Type::INT16: + return f.template operator()(std::forward(args)...); + case arrow::Type::INT32: + return f.template operator()(std::forward(args)...); + case arrow::Type::INT64: + return f.template operator()(std::forward(args)...); + case arrow::Type::UINT8: + return f.template operator()(std::forward(args)...); + case arrow::Type::UINT16: + return f.template operator()(std::forward(args)...); + case arrow::Type::UINT32: + return f.template operator()(std::forward(args)...); + case arrow::Type::UINT64: + return f.template operator()(std::forward(args)...); + case arrow::Type::FLOAT: + return f.template operator()(std::forward(args)...); + case arrow::Type::DOUBLE: + return f.template operator()(std::forward(args)...); + case arrow::Type::BOOL: + return f.template operator()(std::forward(args)...); + case arrow::Type::TIMESTAMP: + return f.template operator()(std::forward(args)...); + case arrow::Type::DURATION: + return f.template operator()(std::forward(args)...); + case arrow::Type::STRING: + return f.template operator()(std::forward(args)...); + case arrow::Type::LIST: + return f.template operator()(std::forward(args)...); + case arrow::Type::DECIMAL128: + return f.template operator()(std::forward(args)...); + case arrow::Type::STRUCT: + return f.template operator()(std::forward(args)...); + default: { + CUDF_FAIL("Invalid type."); + } + } +} + // Converting arrow type to cudf type data_type arrow_to_cudf_type(arrow::DataType const& arrow_type); /** - * @copydoc cudf::to_arrow - * - * @param stream CUDA stream used for device memory operations and kernel launches. + * @copydoc cudf::to_arrow(table_view input, std::vector const& metadata, + * rmm::cuda_stream_view stream, arrow::MemoryPool* ar_mr) */ std::shared_ptr to_arrow(table_view input, std::vector const& metadata, @@ -118,13 +172,27 @@ std::shared_ptr to_arrow(table_view input, arrow::MemoryPool* ar_mr); /** - * @copydoc cudf::arrow_to_cudf - * - * @param stream CUDA stream used for device memory operations and kernel launches. + * @copydoc cudf::to_arrow(cudf::scalar const& input, column_metadata const& metadata, + * rmm::cuda_stream_view stream, arrow::MemoryPool* ar_mr) + */ +std::shared_ptr to_arrow(cudf::scalar const& input, + column_metadata const& metadata, + rmm::cuda_stream_view stream, + arrow::MemoryPool* ar_mr); +/** + * @copydoc cudf::from_arrow(arrow::Table const& input_table, rmm::cuda_stream_view stream, + * rmm::mr::device_memory_resource* mr) */ std::unique_ptr
from_arrow(arrow::Table const& input_table, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr); +/** + * @copydoc cudf::from_arrow(arrow::Scalar const& input, rmm::cuda_stream_view stream, + * rmm::mr::device_memory_resource* mr) + */ +std::unique_ptr from_arrow(arrow::Scalar const& input, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); } // namespace detail } // namespace cudf diff --git a/cpp/include/cudf/interop.hpp b/cpp/include/cudf/interop.hpp index e210179b147..865cc004107 100644 --- a/cpp/include/cudf/interop.hpp +++ b/cpp/include/cudf/interop.hpp @@ -126,23 +126,56 @@ struct column_metadata { * * @param input table_view that needs to be converted to arrow Table * @param metadata Contains hierarchy of names of columns and children + * @param stream CUDA stream used for device memory operations and kernel launches * @param ar_mr arrow memory pool to allocate memory for arrow Table * @return arrow Table generated from `input` */ std::shared_ptr to_arrow(table_view input, std::vector const& metadata = {}, - arrow::MemoryPool* ar_mr = arrow::default_memory_pool()); + rmm::cuda_stream_view stream = cudf::get_default_stream(), + arrow::MemoryPool* ar_mr = arrow::default_memory_pool()); +/** + * @brief Create `arrow::Scalar` from cudf scalar `input` + * + * Converts the `cudf::scalar` to `arrow::Scalar`. + * + * @param input scalar that needs to be converted to arrow Scalar + * @param metadata Contains hierarchy of names of columns and children + * @param stream CUDA stream used for device memory operations and kernel launches + * @param ar_mr arrow memory pool to allocate memory for arrow Scalar + * @return arrow Scalar generated from `input` + */ +std::shared_ptr to_arrow(cudf::scalar const& input, + column_metadata const& metadata = {}, + rmm::cuda_stream_view stream = cudf::get_default_stream(), + arrow::MemoryPool* ar_mr = arrow::default_memory_pool()); /** * @brief Create `cudf::table` from given arrow Table input * * @param input arrow:Table that needs to be converted to `cudf::table` + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate `cudf::table` * @return cudf table generated from given arrow Table */ std::unique_ptr
from_arrow( arrow::Table const& input, + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + +/** + * @brief Create `cudf::scalar` from given arrow Scalar input + * + * @param input `arrow::Scalar` that needs to be converted to `cudf::scalar` + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate `cudf::scalar` + * @return cudf scalar generated from given arrow Scalar + */ + +std::unique_ptr from_arrow( + arrow::Scalar const& input, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @} */ // end of group diff --git a/cpp/src/interop/from_arrow.cu b/cpp/src/interop/from_arrow.cu index 30cfee97fd8..e39625c92e7 100644 --- a/cpp/src/interop/from_arrow.cu +++ b/cpp/src/interop/from_arrow.cu @@ -419,6 +419,52 @@ std::unique_ptr get_column(arrow::Array const& array, : get_empty_type_column(array.length()); } +struct BuilderGenerator { + template && + !std::is_same_v)> + std::shared_ptr operator()(std::shared_ptr const& type) + { + return std::make_shared::BuilderType>( + type, arrow::default_memory_pool()); + } + + template || + std::is_same_v)> + std::shared_ptr operator()(std::shared_ptr const& type) + { + CUDF_FAIL("Type not supported by BuilderGenerator"); + } +}; + +std::shared_ptr make_builder(std::shared_ptr const& type) +{ + switch (type->id()) { + case arrow::Type::STRUCT: { + std::vector> field_builders; + + for (auto field : type->fields()) { + auto const vt = field->type(); + if (vt->id() == arrow::Type::STRUCT || vt->id() == arrow::Type::LIST) { + field_builders.push_back(make_builder(vt)); + } else { + field_builders.push_back(arrow_type_dispatcher(*vt, BuilderGenerator{}, vt)); + } + } + return std::make_shared( + type, arrow::default_memory_pool(), field_builders); + } + case arrow::Type::LIST: { + return std::make_shared(arrow::default_memory_pool(), + make_builder(type->field(0)->type())); + } + default: { + return arrow_type_dispatcher(*type, BuilderGenerator{}, type); + } + } +} + } // namespace std::unique_ptr
from_arrow(arrow::Table const& input_table, @@ -462,14 +508,54 @@ std::unique_ptr
from_arrow(arrow::Table const& input_table, return std::make_unique
(std::move(columns)); } +std::unique_ptr from_arrow(arrow::Scalar const& input, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + // Get a builder for the scalar type + auto builder = detail::make_builder(input.type); + + auto status = builder->AppendScalar(input); + if (status != arrow::Status::OK()) { + if (status.IsNotImplemented()) { + // The only known failure case here is for nulls + CUDF_FAIL("Cannot create untyped null scalars or nested types with untyped null leaf nodes", + std::invalid_argument); + } + CUDF_FAIL("Arrow ArrayBuilder::AppendScalar failed"); + } + + auto maybe_array = builder->Finish(); + if (!maybe_array.ok()) { CUDF_FAIL("Arrow ArrayBuilder::Finish failed"); } + auto array = *maybe_array; + + auto field = arrow::field("", input.type); + + auto table = arrow::Table::Make(arrow::schema({field}), {array}); + + auto cudf_table = detail::from_arrow(*table, stream, mr); + + auto cv = cudf_table->view().column(0); + return get_element(cv, 0, stream); +} + } // namespace detail std::unique_ptr
from_arrow(arrow::Table const& input_table, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::from_arrow(input_table, cudf::get_default_stream(), mr); + return detail::from_arrow(input_table, stream, mr); } +std::unique_ptr from_arrow(arrow::Scalar const& input, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + CUDF_FUNC_RANGE(); + + return detail::from_arrow(input, stream, mr); +} } // namespace cudf diff --git a/cpp/src/interop/to_arrow.cu b/cpp/src/interop/to_arrow.cu index 958a2fcb95f..0cd750bc947 100644 --- a/cpp/src/interop/to_arrow.cu +++ b/cpp/src/interop/to_arrow.cu @@ -15,14 +15,16 @@ */ #include +#include #include +#include #include #include +#include #include #include #include #include -#include #include #include #include @@ -77,7 +79,10 @@ std::shared_ptr fetch_mask_buffer(column_view input_view, auto mask_buffer = allocate_arrow_bitmap(static_cast(input_view.size()), ar_mr); CUDF_CUDA_TRY(cudaMemcpyAsync( mask_buffer->mutable_data(), - (input_view.offset() > 0) ? cudf::copy_bitmask(input_view).data() : input_view.null_mask(), + (input_view.offset() > 0) + ? cudf::detail::copy_bitmask(input_view, stream, rmm::mr::get_current_device_resource()) + .data() + : input_view.null_mask(), mask_size_in_bytes, cudaMemcpyDefault, stream.value())); @@ -139,29 +144,36 @@ struct dispatch_to_arrow { } }; -template <> -std::shared_ptr dispatch_to_arrow::operator()( - column_view input, - cudf::type_id, - column_metadata const&, - arrow::MemoryPool* ar_mr, - rmm::cuda_stream_view stream) +// Convert decimal types from libcudf to arrow where those types are not +// directly supported by Arrow. These types must be fit into 128 bits, the +// smallest decimal resolution supported by Arrow. +template +std::shared_ptr unsupported_decimals_to_arrow(column_view input, + int32_t precision, + arrow::MemoryPool* ar_mr, + rmm::cuda_stream_view stream) { - using DeviceType = int64_t; - size_type const BIT_WIDTH_RATIO = 2; // Array::Type:type::DECIMAL (128) / int64_t + constexpr size_type BIT_WIDTH_RATIO = sizeof(__int128_t) / sizeof(DeviceType); rmm::device_uvector buf(input.size() * BIT_WIDTH_RATIO, stream); auto count = thrust::make_counting_iterator(0); - thrust::for_each(rmm::exec_policy(cudf::get_default_stream()), - count, - count + input.size(), - [in = input.begin(), out = buf.data()] __device__(auto in_idx) { - auto const out_idx = in_idx * 2; - out[out_idx] = in[in_idx]; - out[out_idx + 1] = in[in_idx] < 0 ? -1 : 0; - }); + thrust::for_each( + rmm::exec_policy(cudf::get_default_stream()), + count, + count + input.size(), + [in = input.begin(), out = buf.data(), BIT_WIDTH_RATIO] __device__(auto in_idx) { + auto const out_idx = in_idx * BIT_WIDTH_RATIO; + // The lowest order bits are the value, the remainder + // simply matches the sign bit to satisfy the two's + // complement integer representation of negative numbers. + out[out_idx] = in[in_idx]; +#pragma unroll BIT_WIDTH_RATIO - 1 + for (auto i = 1; i < BIT_WIDTH_RATIO; ++i) { + out[out_idx + i] = in[in_idx] < 0 ? 
-1 : 0; + } + }); auto const buf_size_in_bytes = buf.size() * sizeof(DeviceType); auto data_buffer = allocate_arrow_buffer(buf_size_in_bytes, ar_mr); @@ -169,7 +181,7 @@ std::shared_ptr dispatch_to_arrow::operator()( CUDF_CUDA_TRY(cudaMemcpyAsync( data_buffer->mutable_data(), buf.data(), buf_size_in_bytes, cudaMemcpyDefault, stream.value())); - auto type = arrow::decimal(18, -input.type().scale()); + auto type = arrow::decimal(precision, -input.type().scale()); auto mask = fetch_mask_buffer(input, ar_mr, stream); auto buffers = std::vector>{mask, std::move(data_buffer)}; auto data = std::make_shared(type, input.size(), buffers); @@ -177,6 +189,28 @@ std::shared_ptr dispatch_to_arrow::operator()( return std::make_shared(data); } +template <> +std::shared_ptr dispatch_to_arrow::operator()( + column_view input, + cudf::type_id, + column_metadata const&, + arrow::MemoryPool* ar_mr, + rmm::cuda_stream_view stream) +{ + return unsupported_decimals_to_arrow(input, 9, ar_mr, stream); +} + +template <> +std::shared_ptr dispatch_to_arrow::operator()( + column_view input, + cudf::type_id, + column_metadata const&, + arrow::MemoryPool* ar_mr, + rmm::cuda_stream_view stream) +{ + return unsupported_decimals_to_arrow(input, 18, ar_mr, stream); +} + template <> std::shared_ptr dispatch_to_arrow::operator()( column_view input, @@ -403,14 +437,37 @@ std::shared_ptr to_arrow(table_view input, return result; } + +std::shared_ptr to_arrow(cudf::scalar const& input, + column_metadata const& metadata, + rmm::cuda_stream_view stream, + arrow::MemoryPool* ar_mr) +{ + auto const column = cudf::make_column_from_scalar(input, 1, stream); + cudf::table_view const tv{{column->view()}}; + auto const arrow_table = cudf::to_arrow(tv, {metadata}, stream); + auto const ac = arrow_table->column(0); + auto const maybe_scalar = ac->GetScalar(0); + if (!maybe_scalar.ok()) { CUDF_FAIL("Failed to produce a scalar"); } + return maybe_scalar.ValueOrDie(); +} } // namespace detail std::shared_ptr to_arrow(table_view input, std::vector const& metadata, + rmm::cuda_stream_view stream, arrow::MemoryPool* ar_mr) { CUDF_FUNC_RANGE(); - return detail::to_arrow(input, metadata, cudf::get_default_stream(), ar_mr); + return detail::to_arrow(input, metadata, stream, ar_mr); } +std::shared_ptr to_arrow(cudf::scalar const& input, + column_metadata const& metadata, + rmm::cuda_stream_view stream, + arrow::MemoryPool* ar_mr) +{ + CUDF_FUNC_RANGE(); + return detail::to_arrow(input, metadata, stream, ar_mr); +} } // namespace cudf diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt index c7d3e2af19f..956bfc7c27d 100644 --- a/cpp/tests/CMakeLists.txt +++ b/cpp/tests/CMakeLists.txt @@ -626,6 +626,7 @@ ConfigureTest(STREAM_COPYING_TEST streams/copying_test.cpp STREAM_MODE testing) ConfigureTest(STREAM_FILLING_TEST streams/filling_test.cpp STREAM_MODE testing) ConfigureTest(STREAM_GROUPBY_TEST streams/groupby_test.cpp STREAM_MODE testing) ConfigureTest(STREAM_HASHING_TEST streams/hash_test.cpp STREAM_MODE testing) +ConfigureTest(STREAM_INTEROP_TEST streams/interop_test.cpp STREAM_MODE testing) ConfigureTest(STREAM_REPLACE_TEST streams/replace_test.cpp STREAM_MODE testing) ConfigureTest(STREAM_SEARCH_TEST streams/search_test.cpp STREAM_MODE testing) ConfigureTest( diff --git a/cpp/tests/interop/from_arrow_test.cpp b/cpp/tests/interop/from_arrow_test.cpp index 9a5cc3733af..a898106a5b2 100644 --- a/cpp/tests/interop/from_arrow_test.cpp +++ b/cpp/tests/interop/from_arrow_test.cpp @@ -456,3 +456,98 @@ INSTANTIATE_TEST_CASE_P(FromArrowTest, 
std::make_tuple(0, 0), std::make_tuple(0, 3000), std::make_tuple(10000, 10000))); + +template +struct FromArrowNumericScalarTest : public cudf::test::BaseFixture {}; + +using NumericTypesNotBool = + cudf::test::Concat; +TYPED_TEST_SUITE(FromArrowNumericScalarTest, NumericTypesNotBool); + +TYPED_TEST(FromArrowNumericScalarTest, Basic) +{ + TypeParam const value{42}; + auto const arrow_scalar = arrow::MakeScalar(value); + auto const cudf_scalar = cudf::from_arrow(*arrow_scalar); + auto const cudf_numeric_scalar = + dynamic_cast*>(cudf_scalar.get()); + if (cudf_numeric_scalar == nullptr) { CUDF_FAIL("Attempted to test with a non-numeric type."); } + EXPECT_EQ(cudf_numeric_scalar->type(), cudf::data_type(cudf::type_to_id())); + EXPECT_EQ(cudf_numeric_scalar->value(), value); +} + +struct FromArrowDecimalScalarTest : public cudf::test::BaseFixture {}; + +// Only testing Decimal128 because that's the only size cudf and arrow have in common. +TEST_F(FromArrowDecimalScalarTest, Basic) +{ + auto const value{42}; + auto const precision{8}; + auto const scale{4}; + auto arrow_scalar = arrow::Decimal128Scalar(value, arrow::decimal128(precision, -scale)); + auto cudf_scalar = cudf::from_arrow(arrow_scalar); + + // Arrow offers a minimum of 128 bits for the Decimal type. + auto const cudf_decimal_scalar = + dynamic_cast*>(cudf_scalar.get()); + EXPECT_EQ(cudf_decimal_scalar->type(), + cudf::data_type(cudf::type_to_id(), scale)); + EXPECT_EQ(cudf_decimal_scalar->value(), value); +} + +struct FromArrowStringScalarTest : public cudf::test::BaseFixture {}; + +TEST_F(FromArrowStringScalarTest, Basic) +{ + auto const value = std::string("hello world"); + auto const arrow_scalar = arrow::StringScalar(value); + auto const cudf_scalar = cudf::from_arrow(arrow_scalar); + + auto const cudf_string_scalar = dynamic_cast(cudf_scalar.get()); + EXPECT_EQ(cudf_string_scalar->type(), cudf::data_type(cudf::type_id::STRING)); + EXPECT_EQ(cudf_string_scalar->to_string(), value); +} + +struct FromArrowListScalarTest : public cudf::test::BaseFixture {}; + +TEST_F(FromArrowListScalarTest, Basic) +{ + std::vector host_values = {1, 2, 3, 5, 6, 7, 8}; + std::vector host_validity = {true, true, true, false, true, true, true}; + + arrow::Int64Builder builder; + auto const status = builder.AppendValues(host_values, host_validity); + auto const maybe_array = builder.Finish(); + auto const array = *maybe_array; + + auto const arrow_scalar = arrow::ListScalar(array); + auto const cudf_scalar = cudf::from_arrow(arrow_scalar); + + auto const cudf_list_scalar = dynamic_cast(cudf_scalar.get()); + EXPECT_EQ(cudf_list_scalar->type(), cudf::data_type(cudf::type_id::LIST)); + + cudf::test::fixed_width_column_wrapper const lhs( + host_values.begin(), host_values.end(), host_validity.begin()); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(lhs, cudf_list_scalar->view()); +} + +struct FromArrowStructScalarTest : public cudf::test::BaseFixture {}; + +TEST_F(FromArrowStructScalarTest, Basic) +{ + int64_t const value{42}; + auto const underlying_arrow_scalar = arrow::MakeScalar(value); + + auto const field = arrow::field("", underlying_arrow_scalar->type); + auto const arrow_type = arrow::struct_({field}); + auto const arrow_scalar = arrow::StructScalar({underlying_arrow_scalar}, arrow_type); + auto const cudf_scalar = cudf::from_arrow(arrow_scalar); + + auto const cudf_struct_scalar = dynamic_cast(cudf_scalar.get()); + EXPECT_EQ(cudf_struct_scalar->type(), cudf::data_type(cudf::type_id::STRUCT)); + + cudf::test::fixed_width_column_wrapper const col({value}); + 
cudf::table_view const lhs({col}); + + CUDF_TEST_EXPECT_TABLES_EQUAL(lhs, cudf_struct_scalar->view()); +} diff --git a/cpp/tests/interop/to_arrow_test.cpp b/cpp/tests/interop/to_arrow_test.cpp index 97d80984272..6bb4cdfd747 100644 --- a/cpp/tests/interop/to_arrow_test.cpp +++ b/cpp/tests/interop/to_arrow_test.cpp @@ -22,6 +22,7 @@ #include #include #include +#include #include #include #include @@ -578,4 +579,106 @@ INSTANTIATE_TEST_CASE_P(ToArrowTest, std::make_tuple(0, 0), std::make_tuple(0, 3000))); +template +struct ToArrowNumericScalarTest : public cudf::test::BaseFixture {}; + +using NumericTypesNotBool = + cudf::test::Concat; +TYPED_TEST_SUITE(ToArrowNumericScalarTest, NumericTypesNotBool); + +TYPED_TEST(ToArrowNumericScalarTest, Basic) +{ + TypeParam const value{42}; + auto const cudf_scalar = cudf::make_fixed_width_scalar(value); + + cudf::column_metadata const metadata{""}; + auto const arrow_scalar = cudf::to_arrow(*cudf_scalar, metadata); + + auto const ref_arrow_scalar = arrow::MakeScalar(value); + EXPECT_TRUE(arrow_scalar->Equals(*ref_arrow_scalar)); +} + +struct ToArrowDecimalScalarTest : public cudf::test::BaseFixture {}; + +// Only testing Decimal128 because that's the only size cudf and arrow have in common. +TEST_F(ToArrowDecimalScalarTest, Basic) +{ + auto const value{42}; + auto const precision{18}; // cudf will convert to the widest-precision Arrow scalar of the type + int32_t const scale{4}; + + auto const cudf_scalar = + cudf::make_fixed_point_scalar(value, numeric::scale_type{scale}); + + cudf::column_metadata const metadata{""}; + auto const arrow_scalar = cudf::to_arrow(*cudf_scalar, metadata); + + auto const maybe_ref_arrow_scalar = + arrow::MakeScalar(arrow::decimal128(precision, -scale), value); + if (!maybe_ref_arrow_scalar.ok()) { CUDF_FAIL("Failed to construct reference scalar"); } + auto const ref_arrow_scalar = *maybe_ref_arrow_scalar; + EXPECT_TRUE(arrow_scalar->Equals(*ref_arrow_scalar)); +} + +struct ToArrowStringScalarTest : public cudf::test::BaseFixture {}; + +TEST_F(ToArrowStringScalarTest, Basic) +{ + std::string const value{"hello world"}; + auto const cudf_scalar = cudf::make_string_scalar(value); + cudf::column_metadata const metadata{""}; + auto const arrow_scalar = cudf::to_arrow(*cudf_scalar, metadata); + + auto const ref_arrow_scalar = arrow::MakeScalar(value); + EXPECT_TRUE(arrow_scalar->Equals(*ref_arrow_scalar)); +} + +struct ToArrowListScalarTest : public cudf::test::BaseFixture {}; + +TEST_F(ToArrowListScalarTest, Basic) +{ + std::vector const host_values = {1, 2, 3, 5, 6, 7, 8}; + std::vector const host_validity = {true, true, true, false, true, true, true}; + + cudf::test::fixed_width_column_wrapper const col( + host_values.begin(), host_values.end(), host_validity.begin()); + + auto const cudf_scalar = cudf::make_list_scalar(col); + + cudf::column_metadata const metadata{""}; + auto const arrow_scalar = cudf::to_arrow(*cudf_scalar, metadata); + + arrow::Int64Builder builder; + auto const status = builder.AppendValues(host_values, host_validity); + auto const maybe_array = builder.Finish(); + auto const array = *maybe_array; + + auto const ref_arrow_scalar = arrow::ListScalar(array); + + EXPECT_TRUE(arrow_scalar->Equals(ref_arrow_scalar)); +} + +struct ToArrowStructScalarTest : public cudf::test::BaseFixture {}; + +TEST_F(ToArrowStructScalarTest, Basic) +{ + int64_t const value{42}; + auto const field_name{"a"}; + + cudf::test::fixed_width_column_wrapper const col{value}; + cudf::table_view const tbl({col}); + auto const cudf_scalar = 
cudf::make_struct_scalar(tbl); + + cudf::column_metadata metadata{""}; + metadata.children_meta.emplace_back(field_name); + auto const arrow_scalar = cudf::to_arrow(*cudf_scalar, metadata); + + auto const underlying_arrow_scalar = arrow::MakeScalar(value); + auto const field = arrow::field(field_name, underlying_arrow_scalar->type, false); + auto const arrow_type = arrow::struct_({field}); + auto const ref_arrow_scalar = arrow::StructScalar({underlying_arrow_scalar}, arrow_type); + + EXPECT_TRUE(arrow_scalar->Equals(ref_arrow_scalar)); +} + CUDF_TEST_PROGRAM_MAIN() diff --git a/cpp/tests/streams/interop_test.cpp b/cpp/tests/streams/interop_test.cpp new file mode 100644 index 00000000000..7eac9e016eb --- /dev/null +++ b/cpp/tests/streams/interop_test.cpp @@ -0,0 +1,68 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include + +#include +#include +#include + +struct ArrowTest : public cudf::test::BaseFixture {}; + +TEST_F(ArrowTest, ToArrow) +{ + int32_t const value{42}; + auto col = cudf::test::fixed_width_column_wrapper{{value}}; + cudf::table_view tbl{{col}}; + + std::vector metadata{{""}}; + cudf::to_arrow(tbl, metadata, cudf::test::get_default_stream()); +} + +TEST_F(ArrowTest, FromArrow) +{ + std::vector host_values = {1, 2, 3, 5, 6, 7, 8}; + std::vector host_validity = {true, true, true, false, true, true, true}; + + arrow::Int64Builder builder; + auto status = builder.AppendValues(host_values, host_validity); + auto maybe_array = builder.Finish(); + auto array = *maybe_array; + + auto field = arrow::field("", arrow::int32()); + auto schema = arrow::schema({field}); + auto table = arrow::Table::Make(schema, {array}); + cudf::from_arrow(*table, cudf::test::get_default_stream()); +} + +TEST_F(ArrowTest, ToArrowScalar) +{ + int32_t const value{42}; + auto cudf_scalar = + cudf::make_fixed_width_scalar(value, cudf::test::get_default_stream()); + + cudf::column_metadata metadata{""}; + cudf::to_arrow(*cudf_scalar, metadata, cudf::test::get_default_stream()); +} + +TEST_F(ArrowTest, FromArrowScalar) +{ + int32_t const value{42}; + auto arrow_scalar = arrow::MakeScalar(value); + cudf::from_arrow(*arrow_scalar, cudf::test::get_default_stream()); +} From d67cc5d05a6c18dd832f7b63421296fb66ae56f1 Mon Sep 17 00:00:00 2001 From: MithunR Date: Fri, 22 Sep 2023 22:01:40 -0700 Subject: [PATCH 206/230] Fix assert failure for range window functions (#14168) Authors: - MithunR (https://github.com/mythrocks) - Yunsong Wang (https://github.com/PointKernel) Approvers: - Divye Gala (https://github.com/divyegala) - David Wendt (https://github.com/davidwendt) - Yunsong Wang (https://github.com/PointKernel) URL: https://github.com/rapidsai/cudf/pull/14168 --- cpp/src/rolling/grouped_rolling.cu | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) diff --git a/cpp/src/rolling/grouped_rolling.cu b/cpp/src/rolling/grouped_rolling.cu index 6e69b5157c2..7ac784bef43 100644 --- a/cpp/src/rolling/grouped_rolling.cu 
+++ b/cpp/src/rolling/grouped_rolling.cu @@ -357,6 +357,16 @@ template struct device_value_accessor { column_device_view const col; ///< column view of column in device + /// Checks that the type used to access device values matches the rep-type + /// of the order-by column. + struct is_correct_range_rep { + template /// Order-by type. + constexpr bool operator()() const + { + return std::is_same_v>; + } + }; + /** * @brief constructor * @@ -364,8 +374,11 @@ struct device_value_accessor { */ explicit __device__ device_value_accessor(column_device_view const& col_) : col{col_} { - cudf_assert(type_id_matches_device_storage_type(col.type().id()) && - "the data type mismatch"); + // For non-timestamp types, T must match the order-by column's type. + // For timestamp types, T must match the range rep type for the order-by column. + cudf_assert((type_id_matches_device_storage_type(col.type().id()) or + cudf::type_dispatcher(col.type(), is_correct_range_rep{})) && + "data type mismatch when accessing the order-by column"); } /** From fe3cab5595337300345573d7e64fa52cba78a6c5 Mon Sep 17 00:00:00 2001 From: Karthikeyan <6488848+karthikeyann@users.noreply.github.com> Date: Mon, 25 Sep 2023 10:15:44 +0530 Subject: [PATCH 207/230] Fix Memcheck error found in JSON_TEST JsonReaderTest.ErrorStrings (#14164) Fix missing null mask in string column names parsing. For parsing error, the row is made null. To write output properly, the nulls need to be passed so that they can be skipped during writing output stage in `parse_data`. Fixes #14141 Authors: - Karthikeyan (https://github.com/karthikeyann) Approvers: - David Wendt (https://github.com/davidwendt) - Elias Stehle (https://github.com/elstehle) - Nghia Truong (https://github.com/ttnghia) URL: https://github.com/rapidsai/cudf/pull/14164 --- cpp/src/io/utilities/data_casting.cu | 3 +++ 1 file changed, 3 insertions(+) diff --git a/cpp/src/io/utilities/data_casting.cu b/cpp/src/io/utilities/data_casting.cu index 1772e5e43fa..d16237d7afe 100644 --- a/cpp/src/io/utilities/data_casting.cu +++ b/cpp/src/io/utilities/data_casting.cu @@ -924,6 +924,9 @@ std::unique_ptr parse_data( if (col_size == 0) { return make_empty_column(col_type); } auto d_null_count = rmm::device_scalar(null_count, stream); auto null_count_data = d_null_count.data(); + if (null_mask.is_empty()) { + null_mask = cudf::detail::create_null_mask(col_size, mask_state::ALL_VALID, stream, mr); + } // Prepare iterator that returns (string_ptr, string_length)-pairs needed by type conversion auto str_tuples = thrust::make_transform_iterator(offset_length_begin, to_string_view_pair{data}); From 3f47b5d463445faa9f95b1cc57c46fb5b41f60a7 Mon Sep 17 00:00:00 2001 From: David Wendt <45795991+davidwendt@users.noreply.github.com> Date: Mon, 25 Sep 2023 11:28:33 -0400 Subject: [PATCH 208/230] Move cpp/src/hash/hash_allocator.cuh to include/cudf/hashing/detail (#14163) Moves `cpp/src/hash/hash_allocator.cuh` to `include/cudf/hashing/detail` so it may be more accessible from non-src/hash source files. Also, found `cpp/src/hash/helper_functions.hpp` used in the same way a moved that one as well. No functional changes, just headers moved and includes fixed up. 
Reference: https://github.com/rapidsai/cudf/pull/13930#discussion_r1330118935 Closes #14143 Authors: - David Wendt (https://github.com/davidwendt) Approvers: - Yunsong Wang (https://github.com/PointKernel) - Vyas Ramasubramani (https://github.com/vyasr) URL: https://github.com/rapidsai/cudf/pull/14163 --- .../hash => include/cudf/hashing/detail}/hash_allocator.cuh | 0 .../cudf/hashing/detail}/helper_functions.cuh | 0 cpp/src/hash/concurrent_unordered_map.cuh | 4 ++-- cpp/src/hash/unordered_multiset.cuh | 3 +-- cpp/src/io/json/json_tree.cu | 4 ++-- cpp/src/join/join_common_utils.hpp | 5 ++--- cpp/src/stream_compaction/stream_compaction_common.hpp | 5 ++--- cpp/src/text/subword/bpe_tokenizer.cuh | 3 +-- 8 files changed, 10 insertions(+), 14 deletions(-) rename cpp/{src/hash => include/cudf/hashing/detail}/hash_allocator.cuh (100%) rename cpp/{src/hash => include/cudf/hashing/detail}/helper_functions.cuh (100%) diff --git a/cpp/src/hash/hash_allocator.cuh b/cpp/include/cudf/hashing/detail/hash_allocator.cuh similarity index 100% rename from cpp/src/hash/hash_allocator.cuh rename to cpp/include/cudf/hashing/detail/hash_allocator.cuh diff --git a/cpp/src/hash/helper_functions.cuh b/cpp/include/cudf/hashing/detail/helper_functions.cuh similarity index 100% rename from cpp/src/hash/helper_functions.cuh rename to cpp/include/cudf/hashing/detail/helper_functions.cuh diff --git a/cpp/src/hash/concurrent_unordered_map.cuh b/cpp/src/hash/concurrent_unordered_map.cuh index 439b1c2d066..d773c2763df 100644 --- a/cpp/src/hash/concurrent_unordered_map.cuh +++ b/cpp/src/hash/concurrent_unordered_map.cuh @@ -16,12 +16,12 @@ #pragma once -#include -#include #include #include #include +#include +#include #include #include diff --git a/cpp/src/hash/unordered_multiset.cuh b/cpp/src/hash/unordered_multiset.cuh index 87075a39ea3..183042fc0f4 100644 --- a/cpp/src/hash/unordered_multiset.cuh +++ b/cpp/src/hash/unordered_multiset.cuh @@ -16,11 +16,10 @@ #pragma once -#include - #include #include #include +#include #include #include diff --git a/cpp/src/io/json/json_tree.cu b/cpp/src/io/json/json_tree.cu index 9231040eb70..da5b0eedfbd 100644 --- a/cpp/src/io/json/json_tree.cu +++ b/cpp/src/io/json/json_tree.cu @@ -15,8 +15,6 @@ */ #include "nested_json.hpp" -#include -#include #include #include @@ -24,7 +22,9 @@ #include #include #include +#include #include +#include #include #include diff --git a/cpp/src/join/join_common_utils.hpp b/cpp/src/join/join_common_utils.hpp index 4c1b1ed98b1..e96505e5ed6 100644 --- a/cpp/src/join/join_common_utils.hpp +++ b/cpp/src/join/join_common_utils.hpp @@ -17,13 +17,12 @@ #include #include +#include +#include #include #include #include -#include -#include - #include #include diff --git a/cpp/src/stream_compaction/stream_compaction_common.hpp b/cpp/src/stream_compaction/stream_compaction_common.hpp index 58d958d2ff4..18c531e3e69 100644 --- a/cpp/src/stream_compaction/stream_compaction_common.hpp +++ b/cpp/src/stream_compaction/stream_compaction_common.hpp @@ -15,12 +15,11 @@ */ #pragma once +#include +#include #include #include -#include -#include - #include #include diff --git a/cpp/src/text/subword/bpe_tokenizer.cuh b/cpp/src/text/subword/bpe_tokenizer.cuh index 83aa22aaae9..2fa879ea734 100644 --- a/cpp/src/text/subword/bpe_tokenizer.cuh +++ b/cpp/src/text/subword/bpe_tokenizer.cuh @@ -18,10 +18,9 @@ #include -#include - #include #include +#include #include #include From 036c07d363406da9e500c3d6be9a3edca28fd6c2 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke 
<10647082+mroeschke@users.noreply.github.com> Date: Mon, 25 Sep 2023 06:36:26 -1000 Subject: [PATCH 209/230] Fix DataFrame from Series with different CategoricalIndexes (#14157) closes #14130 Authors: - Matthew Roeschke (https://github.com/mroeschke) Approvers: - GALI PREM SAGAR (https://github.com/galipremsagar) URL: https://github.com/rapidsai/cudf/pull/14157 --- python/cudf/cudf/core/indexed_frame.py | 7 +++++++ python/cudf/cudf/tests/test_dataframe.py | 13 +++++++++++++ 2 files changed, 20 insertions(+) diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py index 62e091b29b5..aacf1fa8dae 100644 --- a/python/cudf/cudf/core/indexed_frame.py +++ b/python/cudf/cudf/core/indexed_frame.py @@ -5438,6 +5438,13 @@ def _is_same_dtype(lhs_dtype, rhs_dtype): # for matching column dtype. if lhs_dtype == rhs_dtype: return True + elif ( + is_categorical_dtype(lhs_dtype) + and is_categorical_dtype(rhs_dtype) + and lhs_dtype.categories.dtype == rhs_dtype.categories.dtype + ): + # OK if categories are not all the same + return True elif ( is_categorical_dtype(lhs_dtype) and not is_categorical_dtype(rhs_dtype) diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py index 2f531afdeb7..67b63028fab 100644 --- a/python/cudf/cudf/tests/test_dataframe.py +++ b/python/cudf/cudf/tests/test_dataframe.py @@ -10408,6 +10408,19 @@ def test_dataframe_init_from_nested_dict(): assert_eq(pdf, gdf) +def test_init_from_2_categoricalindex_series_diff_categories(): + s1 = cudf.Series( + [39, 6, 4], index=cudf.CategoricalIndex(["female", "male", "unknown"]) + ) + s2 = cudf.Series( + [2, 152, 2, 242, 150], + index=cudf.CategoricalIndex(["f", "female", "m", "male", "unknown"]), + ) + result = cudf.DataFrame([s1, s2]) + expected = pd.DataFrame([s1.to_pandas(), s2.to_pandas()]) + assert_eq(result, expected, check_dtype=False) + + def test_data_frame_values_no_cols_but_index(): result = cudf.DataFrame(index=range(5)).values expected = pd.DataFrame(index=range(5)).values From ddd2b0dfac0903c5f17d581eca5d6b945ede9451 Mon Sep 17 00:00:00 2001 From: "Richard (Rick) Zamora" Date: Mon, 25 Sep 2023 13:14:18 -0500 Subject: [PATCH 210/230] Allow explicit `shuffle="p2p"` within dask-cudf API (#13893) This PR allows explicit `shuffle="p2p"` usage within the dask-cudf API now that https://github.com/dask/distributed/pull/7743 is in. 
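As a rough illustration (not part of this patch), a caller should now be able to request the peer-to-peer shuffle explicitly along these lines; the cluster setup, column name, and partition count below are placeholders only:

```python
# Hedged sketch: explicitly selecting the "p2p" shuffle in dask-cudf.
# Assumes dask-cuda and a working GPU; names and sizes are illustrative.
import dask_cuda
import dask_cudf
from distributed import Client

import cudf

if __name__ == "__main__":
    with dask_cuda.LocalCUDACluster(n_workers=1) as cluster, Client(cluster):
        df = cudf.DataFrame({"x": list(range(1000))})
        ddf = dask_cudf.from_cudf(df, npartitions=10)
        # Previously only "tasks" was accepted here; "p2p" is now allowed.
        result = ddf.sort_values("x", shuffle="p2p").compute()
```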
Authors: - Richard (Rick) Zamora (https://github.com/rjzamora) - Ray Douglass (https://github.com/raydouglass) - gpuCI (https://github.com/GPUtester) - Mike Wendt (https://github.com/mike-wendt) - AJ Schmidt (https://github.com/ajschmidt8) - GALI PREM SAGAR (https://github.com/galipremsagar) - Lawrence Mitchell (https://github.com/wence-) Approvers: - Lawrence Mitchell (https://github.com/wence-) URL: https://github.com/rapidsai/cudf/pull/13893 --- python/dask_cudf/dask_cudf/backends.py | 31 ++++++++++++++++--- python/dask_cudf/dask_cudf/sorting.py | 26 +++++++++++----- .../dask_cudf/tests/test_dispatch.py | 11 +++++-- .../dask_cudf/tests/test_distributed.py | 22 ++++++++++++- 4 files changed, 76 insertions(+), 14 deletions(-) diff --git a/python/dask_cudf/dask_cudf/backends.py b/python/dask_cudf/dask_cudf/backends.py index e3f4f04eb85..344b03c631d 100644 --- a/python/dask_cudf/dask_cudf/backends.py +++ b/python/dask_cudf/dask_cudf/backends.py @@ -373,22 +373,37 @@ def percentile_cudf(a, q, interpolation="linear"): @pyarrow_schema_dispatch.register((cudf.DataFrame,)) -def _get_pyarrow_schema_cudf(obj, preserve_index=True, **kwargs): +def _get_pyarrow_schema_cudf(obj, preserve_index=None, **kwargs): if kwargs: warnings.warn( "Ignoring the following arguments to " f"`pyarrow_schema_dispatch`: {list(kwargs)}" ) - return meta_nonempty(obj).to_arrow(preserve_index=preserve_index).schema + + return _cudf_to_table( + meta_nonempty(obj), preserve_index=preserve_index + ).schema @to_pyarrow_table_dispatch.register(cudf.DataFrame) -def _cudf_to_table(obj, preserve_index=True, **kwargs): +def _cudf_to_table(obj, preserve_index=None, **kwargs): if kwargs: warnings.warn( "Ignoring the following arguments to " f"`to_pyarrow_table_dispatch`: {list(kwargs)}" ) + + # TODO: Remove this logic when cudf#14159 is resolved + # (see: https://github.com/rapidsai/cudf/issues/14159) + if preserve_index and isinstance(obj.index, cudf.RangeIndex): + obj = obj.copy() + obj.index.name = ( + obj.index.name + if obj.index.name is not None + else "__index_level_0__" + ) + obj.index = obj.index._as_int_index() + return obj.to_arrow(preserve_index=preserve_index) @@ -401,7 +416,15 @@ def _table_to_cudf(obj, table, self_destruct=None, **kwargs): f"Ignoring the following arguments to " f"`from_pyarrow_table_dispatch`: {list(kwargs)}" ) - return obj.from_arrow(table) + result = obj.from_arrow(table) + + # TODO: Remove this logic when cudf#14159 is resolved + # (see: https://github.com/rapidsai/cudf/issues/14159) + if "__index_level_0__" in result.index.names: + assert len(result.index.names) == 1 + result.index.name = None + + return result @union_categoricals_dispatch.register((cudf.Series, cudf.BaseIndex)) diff --git a/python/dask_cudf/dask_cudf/sorting.py b/python/dask_cudf/dask_cudf/sorting.py index e841f2d8830..d6c9c1be73c 100644 --- a/python/dask_cudf/dask_cudf/sorting.py +++ b/python/dask_cudf/dask_cudf/sorting.py @@ -6,7 +6,7 @@ import numpy as np import tlz as toolz -import dask +from dask import config from dask.base import tokenize from dask.dataframe import methods from dask.dataframe.core import DataFrame, Index, Series @@ -18,6 +18,8 @@ from cudf.api.types import is_categorical_dtype from cudf.utils.utils import _dask_cudf_nvtx_annotate +_SHUFFLE_SUPPORT = ("tasks", "p2p") # "disk" not supported + @_dask_cudf_nvtx_annotate def set_index_post(df, index_name, drop, column_dtype): @@ -307,15 +309,25 @@ def sort_values( return df4 +def get_default_shuffle_method(): + # Note that 
`dask.utils.get_default_shuffle_method` + # will return "p2p" by default when a distributed + # client is present. Dask-cudf supports "p2p", but + # will not use it by default (yet) + default = config.get("dataframe.shuffle.method", "tasks") + if default not in _SHUFFLE_SUPPORT: + default = "tasks" + return default + + def _get_shuffle_type(shuffle): # Utility to set the shuffle-kwarg default - # and to validate user-specified options. - # The only supported options is currently "tasks" - shuffle = shuffle or dask.config.get("shuffle", "tasks") - if shuffle != "tasks": + # and to validate user-specified options + shuffle = shuffle or get_default_shuffle_method() + if shuffle not in _SHUFFLE_SUPPORT: raise ValueError( - f"Dask-cudf only supports in-memory shuffling with " - f"'tasks'. Got shuffle={shuffle}" + "Dask-cudf only supports the following shuffle " + f"methods: {_SHUFFLE_SUPPORT}. Got shuffle={shuffle}" ) return shuffle diff --git a/python/dask_cudf/dask_cudf/tests/test_dispatch.py b/python/dask_cudf/dask_cudf/tests/test_dispatch.py index cf49b1df4f4..c64e25fd437 100644 --- a/python/dask_cudf/dask_cudf/tests/test_dispatch.py +++ b/python/dask_cudf/dask_cudf/tests/test_dispatch.py @@ -22,18 +22,25 @@ def test_is_categorical_dispatch(): assert is_categorical_dtype(cudf.Index([1, 2, 3], dtype="category")) -def test_pyarrow_conversion_dispatch(): +@pytest.mark.parametrize("preserve_index", [True, False]) +def test_pyarrow_conversion_dispatch(preserve_index): from dask.dataframe.dispatch import ( from_pyarrow_table_dispatch, to_pyarrow_table_dispatch, ) df1 = cudf.DataFrame(np.random.randn(10, 3), columns=list("abc")) - df2 = from_pyarrow_table_dispatch(df1, to_pyarrow_table_dispatch(df1)) + df2 = from_pyarrow_table_dispatch( + df1, to_pyarrow_table_dispatch(df1, preserve_index=preserve_index) + ) assert type(df1) == type(df2) assert_eq(df1, df2) + # Check that preserve_index does not produce a RangeIndex + if preserve_index: + assert not isinstance(df2.index, cudf.RangeIndex) + @pytest.mark.parametrize("index", [None, [1, 2] * 5]) def test_deterministic_tokenize(index): diff --git a/python/dask_cudf/dask_cudf/tests/test_distributed.py b/python/dask_cudf/dask_cudf/tests/test_distributed.py index e24feaa2ea4..db3f3695648 100644 --- a/python/dask_cudf/dask_cudf/tests/test_distributed.py +++ b/python/dask_cudf/dask_cudf/tests/test_distributed.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2022, NVIDIA CORPORATION. +# Copyright (c) 2020-2023, NVIDIA CORPORATION. import numba.cuda import pytest @@ -77,3 +77,23 @@ def test_str_series_roundtrip(): actual = dask_series.compute() assert_eq(actual, expected) + + +def test_p2p_shuffle(): + # Check that we can use `shuffle="p2p"` + with dask_cuda.LocalCUDACluster(n_workers=1) as cluster: + with Client(cluster): + ddf = ( + dask.datasets.timeseries( + start="2000-01-01", + end="2000-01-08", + dtypes={"x": int}, + ) + .reset_index(drop=True) + .to_backend("cudf") + ) + dd.assert_eq( + ddf.sort_values("x", shuffle="p2p").compute(), + ddf.compute().sort_values("x"), + check_index=False, + ) From 1b925bfc7741eb22fed0a978fa0e1d0d5dfee601 Mon Sep 17 00:00:00 2001 From: Vukasin Milovanovic Date: Mon, 25 Sep 2023 13:09:16 -0700 Subject: [PATCH 211/230] Add Parquet reader benchmarks for row selection (#14147) Re-enabled the group of benchmarks that compares row selection options in Parquet reader. Use `read_parquet_metadata` to get the column names and number of row groups. Clean up read chunk computation for ORC and Parquet benchmarks. 
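For reference, a minimal sketch of the metadata-first pattern the benchmark now relies on (assuming a local file `example.parquet`; the include paths and file name are illustrative and not part of the patch):

```cpp
// Hedged sketch: query Parquet metadata, then read a subset of row groups.
#include <cudf/io/parquet.hpp>
#include <cudf/io/parquet_metadata.hpp>

#include <string>
#include <vector>

int main()
{
  auto const source   = cudf::io::source_info{"example.parquet"};
  auto const metadata = cudf::io::read_parquet_metadata(source);

  // Top-level column names, as used for the benchmark's column selection.
  std::vector<std::string> col_names;
  for (auto const& col : metadata.schema().root().children()) {
    col_names.push_back(col.name());
  }

  // Read only the first of metadata.num_rowgroups() row groups.
  auto const options = cudf::io::parquet_reader_options::builder(source)
                         .columns(col_names)
                         .row_groups({{0}})
                         .build();
  auto const result = cudf::io::read_parquet(options);
  return 0;
}
```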
Authors: - Vukasin Milovanovic (https://github.com/vuule) Approvers: - https://github.com/nvdbaranec - Nghia Truong (https://github.com/ttnghia) URL: https://github.com/rapidsai/cudf/pull/14147 --- cpp/benchmarks/io/cuio_common.cpp | 18 ++--- cpp/benchmarks/io/orc/orc_reader_options.cpp | 12 ++-- .../io/parquet/parquet_reader_options.cpp | 65 +++++++++++-------- 3 files changed, 53 insertions(+), 42 deletions(-) diff --git a/cpp/benchmarks/io/cuio_common.cpp b/cpp/benchmarks/io/cuio_common.cpp index 6b8af91b842..b1aaef41340 100644 --- a/cpp/benchmarks/io/cuio_common.cpp +++ b/cpp/benchmarks/io/cuio_common.cpp @@ -15,6 +15,7 @@ */ #include +#include #include #include @@ -141,17 +142,18 @@ std::vector select_column_names(std::vector const& col return col_names_to_read; } -std::vector segments_in_chunk(int num_segments, int num_chunks, int chunk) +std::vector segments_in_chunk(int num_segments, int num_chunks, int chunk_idx) { CUDF_EXPECTS(num_segments >= num_chunks, "Number of chunks cannot be greater than the number of segments in the file"); - auto start_segment = [num_segments, num_chunks](int chunk) { - return num_segments * chunk / num_chunks; - }; - std::vector selected_segments; - for (auto segment = start_segment(chunk); segment < start_segment(chunk + 1); ++segment) { - selected_segments.push_back(segment); - } + CUDF_EXPECTS(chunk_idx < num_chunks, + "Chunk index must be smaller than the number of chunks in the file"); + + auto const segments_in_chunk = cudf::util::div_rounding_up_unsafe(num_segments, num_chunks); + auto const begin_segment = std::min(chunk_idx * segments_in_chunk, num_segments); + auto const end_segment = std::min(begin_segment + segments_in_chunk, num_segments); + std::vector selected_segments(end_segment - begin_segment); + std::iota(selected_segments.begin(), selected_segments.end(), begin_segment); return selected_segments; } diff --git a/cpp/benchmarks/io/orc/orc_reader_options.cpp b/cpp/benchmarks/io/orc/orc_reader_options.cpp index 647a411c89d..1f656f7ea70 100644 --- a/cpp/benchmarks/io/orc/orc_reader_options.cpp +++ b/cpp/benchmarks/io/orc/orc_reader_options.cpp @@ -19,6 +19,7 @@ #include #include +#include #include #include #include @@ -30,7 +31,7 @@ constexpr int64_t data_size = 512 << 20; // The number of separate read calls to use when reading files in multiple chunks // Each call reads roughly equal amounts of data -constexpr int32_t chunked_read_num_chunks = 8; +constexpr int32_t chunked_read_num_chunks = 4; std::vector get_top_level_col_names(cudf::io::source_info const& source) { @@ -88,7 +89,7 @@ void BM_orc_read_varying_options(nvbench::state& state, auto const num_stripes = cudf::io::read_orc_metadata(source_sink.make_source_info()).num_stripes(); - cudf::size_type const chunk_row_cnt = view.num_rows() / num_chunks; + auto const chunk_row_cnt = cudf::util::div_rounding_up_unsafe(view.num_rows(), num_chunks); auto mem_stats_logger = cudf::memory_stats_logger(); state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::get_default_stream().value())); @@ -99,7 +100,6 @@ void BM_orc_read_varying_options(nvbench::state& state, timer.start(); cudf::size_type rows_read = 0; for (int32_t chunk = 0; chunk < num_chunks; ++chunk) { - auto const is_last_chunk = chunk == (num_chunks - 1); switch (RowSelection) { case row_selection::ALL: break; case row_selection::STRIPES: @@ -108,7 +108,6 @@ void BM_orc_read_varying_options(nvbench::state& state, case row_selection::NROWS: read_options.set_skip_rows(chunk * chunk_row_cnt); 
read_options.set_num_rows(chunk_row_cnt); - if (is_last_chunk) read_options.set_num_rows(-1); break; default: CUDF_FAIL("Unsupported row selection method"); } @@ -132,9 +131,6 @@ using col_selections = nvbench::enum_type_list; -using row_selections = - nvbench::enum_type_list; - NVBENCH_BENCH_TYPES(BM_orc_read_varying_options, NVBENCH_TYPE_AXES(col_selections, nvbench::enum_type_list, @@ -146,6 +142,8 @@ NVBENCH_BENCH_TYPES(BM_orc_read_varying_options, {"column_selection", "row_selection", "uses_index", "uses_numpy_dtype", "timestamp_type"}) .set_min_samples(4); +using row_selections = + nvbench::enum_type_list; NVBENCH_BENCH_TYPES(BM_orc_read_varying_options, NVBENCH_TYPE_AXES(nvbench::enum_type_list, row_selections, diff --git a/cpp/benchmarks/io/parquet/parquet_reader_options.cpp b/cpp/benchmarks/io/parquet/parquet_reader_options.cpp index 4105f2182d7..9f221de7da2 100644 --- a/cpp/benchmarks/io/parquet/parquet_reader_options.cpp +++ b/cpp/benchmarks/io/parquet/parquet_reader_options.cpp @@ -19,6 +19,7 @@ #include #include +#include #include #include @@ -26,21 +27,21 @@ // Size of the data in the benchmark dataframe; chosen to be low enough to allow benchmarks to // run on most GPUs, but large enough to allow highest throughput -constexpr std::size_t data_size = 512 << 20; -constexpr std::size_t row_group_size = 128 << 20; +constexpr std::size_t data_size = 512 << 20; +// The number of separate read calls to use when reading files in multiple chunks +// Each call reads roughly equal amounts of data +constexpr int32_t chunked_read_num_chunks = 4; std::vector get_top_level_col_names(cudf::io::source_info const& source) { - cudf::io::parquet_reader_options const read_options = - cudf::io::parquet_reader_options::builder(source); - auto const schema = cudf::io::read_parquet(read_options).metadata.schema_info; - - std::vector names; - names.reserve(schema.size()); - std::transform(schema.cbegin(), schema.cend(), std::back_inserter(names), [](auto const& c) { - return c.name; - }); - return names; + auto const top_lvl_cols = cudf::io::read_parquet_metadata(source).schema().root().children(); + std::vector col_names; + std::transform(top_lvl_cols.cbegin(), + top_lvl_cols.cend(), + std::back_inserter(col_names), + [](auto const& col_meta) { return col_meta.name(); }); + + return col_names; } template , nvbench::enum_type>) { + auto const num_chunks = RowSelection == row_selection::ALL ? 
1 : chunked_read_num_chunks; + auto constexpr str_to_categories = ConvertsStrings == converts_strings::YES; auto constexpr uses_pd_metadata = UsesPandasMetadata == uses_pandas_metadata::YES; @@ -87,9 +90,8 @@ void BM_parquet_read_options(nvbench::state& state, .use_pandas_metadata(uses_pd_metadata) .timestamp_type(ts_type); - // TODO: add read_parquet_metadata to properly calculate #row_groups - auto constexpr num_row_groups = data_size / row_group_size; - auto constexpr num_chunks = 1; + auto const num_row_groups = read_parquet_metadata(source_sink.make_source_info()).num_rowgroups(); + auto const chunk_row_cnt = cudf::util::div_rounding_up_unsafe(view.num_rows(), num_chunks); auto mem_stats_logger = cudf::memory_stats_logger(); state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::get_default_stream().value())); @@ -100,18 +102,15 @@ void BM_parquet_read_options(nvbench::state& state, timer.start(); cudf::size_type rows_read = 0; for (int32_t chunk = 0; chunk < num_chunks; ++chunk) { - auto const is_last_chunk = chunk == (num_chunks - 1); switch (RowSelection) { case row_selection::ALL: break; case row_selection::ROW_GROUPS: { - auto row_groups_to_read = segments_in_chunk(num_row_groups, num_chunks, chunk); - if (is_last_chunk) { - // Need to assume that an additional "overflow" row group is present - row_groups_to_read.push_back(num_row_groups); - } - read_options.set_row_groups({row_groups_to_read}); + read_options.set_row_groups({segments_in_chunk(num_row_groups, num_chunks, chunk)}); } break; - case row_selection::NROWS: [[fallthrough]]; + case row_selection::NROWS: + read_options.set_skip_rows(chunk * chunk_row_cnt); + read_options.set_num_rows(chunk_row_cnt); + break; default: CUDF_FAIL("Unsupported row selection method"); } @@ -130,14 +129,26 @@ void BM_parquet_read_options(nvbench::state& state, state.add_buffer_size(source_sink.size(), "encoded_file_size", "encoded_file_size"); } +using row_selections = + nvbench::enum_type_list; +NVBENCH_BENCH_TYPES(BM_parquet_read_options, + NVBENCH_TYPE_AXES(nvbench::enum_type_list, + row_selections, + nvbench::enum_type_list, + nvbench::enum_type_list, + nvbench::enum_type_list)) + .set_name("parquet_read_row_selection") + .set_type_axes_names({"column_selection", + "row_selection", + "str_to_categories", + "uses_pandas_metadata", + "timestamp_type"}) + .set_min_samples(4); + using col_selections = nvbench::enum_type_list; - -// TODO: row_selection::ROW_GROUPS disabled until we add an API to read metadata from a parquet file -// and determine num row groups. https://github.com/rapidsai/cudf/pull/9963#issuecomment-1004832863 - NVBENCH_BENCH_TYPES(BM_parquet_read_options, NVBENCH_TYPE_AXES(col_selections, nvbench::enum_type_list, From f3402c402c2d0be54a6f2060e1bd74e284c1e687 Mon Sep 17 00:00:00 2001 From: Suraj Aralihalli Date: Mon, 25 Sep 2023 14:10:44 -0700 Subject: [PATCH 212/230] Add stream parameter to external dict APIs (#14115) This PR adds stream parameter to public dictionary APIs, which include: 1. `cudf::dictionary::encode` 2. `cudf::dictionary::decode` 3. `cudf::dictionary::get_index` 4. `cudf::dictionary::add_keys` 5. `cudf::dictionary::remove_keys` 6. `cudf::dictionary::remove_unused_keys` 7. `cudf::dictionary::set_keys` 8. 
`cudf::dictionary::match_dictionaries` Reference [13744](https://github.com/rapidsai/cudf/issues/13744) Authors: - Suraj Aralihalli (https://github.com/SurajAralihalli) - Yunsong Wang (https://github.com/PointKernel) Approvers: - Vyas Ramasubramani (https://github.com/vyasr) URL: https://github.com/rapidsai/cudf/pull/14115 --- cpp/include/cudf/dictionary/encode.hpp | 6 +- cpp/include/cudf/dictionary/search.hpp | 6 +- cpp/include/cudf/dictionary/update_keys.hpp | 16 ++- cpp/include/cudf_test/column_wrapper.hpp | 18 +++- cpp/src/dictionary/add_keys.cu | 3 +- cpp/src/dictionary/decode.cu | 5 +- cpp/src/dictionary/encode.cu | 5 +- cpp/src/dictionary/remove_keys.cu | 6 +- cpp/src/dictionary/search.cu | 11 +- cpp/src/dictionary/set_keys.cu | 9 +- cpp/tests/CMakeLists.txt | 1 + cpp/tests/streams/dictionary_test.cpp | 105 ++++++++++++++++++++ 12 files changed, 164 insertions(+), 27 deletions(-) create mode 100644 cpp/tests/streams/dictionary_test.cpp diff --git a/cpp/include/cudf/dictionary/encode.hpp b/cpp/include/cudf/dictionary/encode.hpp index fb13eabe11a..959b785bf87 100644 --- a/cpp/include/cudf/dictionary/encode.hpp +++ b/cpp/include/cudf/dictionary/encode.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2022, NVIDIA CORPORATION. + * Copyright (c) 2020-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -53,12 +53,14 @@ namespace dictionary { * * @param column The column to dictionary encode * @param indices_type The integer type to use for the indices + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned column's device memory * @return Returns a dictionary column */ std::unique_ptr encode( column_view const& column, data_type indices_type = data_type{type_id::UINT32}, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -72,11 +74,13 @@ std::unique_ptr encode( * @endcode * * @param dictionary_column Existing dictionary column + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned column's device memory * @return New column with type matching the dictionary_column's keys */ std::unique_ptr decode( dictionary_column_view const& dictionary_column, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @} */ // end of group diff --git a/cpp/include/cudf/dictionary/search.hpp b/cpp/include/cudf/dictionary/search.hpp index ed7a9c84693..1b72cf42acd 100644 --- a/cpp/include/cudf/dictionary/search.hpp +++ b/cpp/include/cudf/dictionary/search.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2022, NVIDIA CORPORATION. + * Copyright (c) 2020-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -37,12 +37,14 @@ namespace dictionary { * * @param dictionary The dictionary to search for the key. * @param key The value to search for in the dictionary keyset. + * @param stream CUDA stream used for device memory operations and kernel launches. * @param mr Device memory resource used to allocate the returned scalar's device memory. 
- * @return Numeric scalar index value of the key within the dictionary + * @return Numeric scalar index value of the key within the dictionary. */ std::unique_ptr get_index( dictionary_column_view const& dictionary, scalar const& key, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @} */ // end of group diff --git a/cpp/include/cudf/dictionary/update_keys.hpp b/cpp/include/cudf/dictionary/update_keys.hpp index 2fcfb5e1f7c..81728e1ff73 100644 --- a/cpp/include/cudf/dictionary/update_keys.hpp +++ b/cpp/include/cudf/dictionary/update_keys.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2022, NVIDIA CORPORATION. + * Copyright (c) 2020-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -51,13 +51,15 @@ namespace dictionary { * @throw cudf_logic_error if the new_keys contain nulls. * * @param dictionary_column Existing dictionary column. - * @param new_keys New keys to incorporate into the dictionary_column + * @param new_keys New keys to incorporate into the dictionary_column. + * @param stream CUDA stream used for device memory operations and kernel launches. * @param mr Device memory resource used to allocate the returned column's device memory. * @return New dictionary column. */ std::unique_ptr add_keys( dictionary_column_view const& dictionary_column, column_view const& new_keys, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -81,13 +83,15 @@ std::unique_ptr add_keys( * @throw cudf_logic_error if the keys_to_remove contain nulls. * * @param dictionary_column Existing dictionary column. - * @param keys_to_remove The keys to remove from the dictionary_column + * @param keys_to_remove The keys to remove from the dictionary_column. + * @param stream CUDA stream used for device memory operations and kernel launches. * @param mr Device memory resource used to allocate the returned column's device memory. * @return New dictionary column. */ std::unique_ptr remove_keys( dictionary_column_view const& dictionary_column, column_view const& keys_to_remove, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -103,11 +107,13 @@ std::unique_ptr remove_keys( * @endcode * * @param dictionary_column Existing dictionary column. + * @param stream CUDA stream used for device memory operations and kernel launches. * @param mr Device memory resource used to allocate the returned column's device memory. * @return New dictionary column. */ std::unique_ptr remove_unused_keys( dictionary_column_view const& dictionary_column, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -134,12 +140,14 @@ std::unique_ptr remove_unused_keys( * * @param dictionary_column Existing dictionary column. * @param keys New keys to use for the output column. Must not contain nulls. + * @param stream CUDA stream used for device memory operations and kernel launches. * @param mr Device memory resource used to allocate the returned column's device memory. * @return New dictionary column. 
*/ std::unique_ptr set_keys( dictionary_column_view const& dictionary_column, column_view const& keys, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -149,11 +157,13 @@ std::unique_ptr set_keys( * The result is a vector of new dictionaries with a common set of keys. * * @param input Dictionary columns to match keys. + * @param stream CUDA stream used for device memory operations and kernel launches. * @param mr Device memory resource used to allocate the returned column's device memory. * @return New dictionary columns. */ std::vector> match_dictionaries( cudf::host_span input, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @} */ // end of group diff --git a/cpp/include/cudf_test/column_wrapper.hpp b/cpp/include/cudf_test/column_wrapper.hpp index cc8cac35ef4..c0932b81dc3 100644 --- a/cpp/include/cudf_test/column_wrapper.hpp +++ b/cpp/include/cudf_test/column_wrapper.hpp @@ -944,8 +944,10 @@ class dictionary_column_wrapper : public detail::column_wrapper { template dictionary_column_wrapper(InputIterator begin, InputIterator end) : column_wrapper{} { - wrapped = cudf::dictionary::encode( - fixed_width_column_wrapper(begin, end)); + wrapped = + cudf::dictionary::encode(fixed_width_column_wrapper(begin, end), + cudf::data_type{type_id::UINT32}, + cudf::test::get_default_stream()); } /** @@ -978,7 +980,9 @@ class dictionary_column_wrapper : public detail::column_wrapper { : column_wrapper{} { wrapped = cudf::dictionary::encode( - fixed_width_column_wrapper(begin, end, v)); + fixed_width_column_wrapper(begin, end, v), + cudf::data_type{type_id::UINT32}, + cudf::test::get_default_stream()); } /** @@ -1134,7 +1138,9 @@ class dictionary_column_wrapper : public detail::column_wrapper { template dictionary_column_wrapper(StringsIterator begin, StringsIterator end) : column_wrapper{} { - wrapped = cudf::dictionary::encode(strings_column_wrapper(begin, end)); + wrapped = cudf::dictionary::encode(strings_column_wrapper(begin, end), + cudf::data_type{type_id::UINT32}, + cudf::test::get_default_stream()); } /** @@ -1169,7 +1175,9 @@ class dictionary_column_wrapper : public detail::column_wrapper { dictionary_column_wrapper(StringsIterator begin, StringsIterator end, ValidityIterator v) : column_wrapper{} { - wrapped = cudf::dictionary::encode(strings_column_wrapper(begin, end, v)); + wrapped = cudf::dictionary::encode(strings_column_wrapper(begin, end, v), + cudf::data_type{type_id::UINT32}, + cudf::test::get_default_stream()); } /** diff --git a/cpp/src/dictionary/add_keys.cu b/cpp/src/dictionary/add_keys.cu index ab22c07e4d5..3973100aced 100644 --- a/cpp/src/dictionary/add_keys.cu +++ b/cpp/src/dictionary/add_keys.cu @@ -130,10 +130,11 @@ std::unique_ptr add_keys(dictionary_column_view const& dictionary_column std::unique_ptr add_keys(dictionary_column_view const& dictionary_column, column_view const& keys, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::add_keys(dictionary_column, keys, cudf::get_default_stream(), mr); + return detail::add_keys(dictionary_column, keys, stream, mr); } } // namespace dictionary diff --git a/cpp/src/dictionary/decode.cu b/cpp/src/dictionary/decode.cu index 01411d06b62..fdf546b5875 100644 --- a/cpp/src/dictionary/decode.cu +++ b/cpp/src/dictionary/decode.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2022, NVIDIA 
CORPORATION. + * Copyright (c) 2020-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -65,10 +65,11 @@ std::unique_ptr decode(dictionary_column_view const& source, } // namespace detail std::unique_ptr decode(dictionary_column_view const& source, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::decode(source, cudf::get_default_stream(), mr); + return detail::decode(source, stream, mr); } } // namespace dictionary diff --git a/cpp/src/dictionary/encode.cu b/cpp/src/dictionary/encode.cu index fe8e777b694..c92b57f0cac 100644 --- a/cpp/src/dictionary/encode.cu +++ b/cpp/src/dictionary/encode.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2022, NVIDIA CORPORATION. + * Copyright (c) 2020-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -89,10 +89,11 @@ data_type get_indices_type_for_size(size_type keys_size) std::unique_ptr encode(column_view const& input_column, data_type indices_type, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::encode(input_column, indices_type, cudf::get_default_stream(), mr); + return detail::encode(input_column, indices_type, stream, mr); } } // namespace dictionary diff --git a/cpp/src/dictionary/remove_keys.cu b/cpp/src/dictionary/remove_keys.cu index 9fe4a63373b..86b70f1119b 100644 --- a/cpp/src/dictionary/remove_keys.cu +++ b/cpp/src/dictionary/remove_keys.cu @@ -195,17 +195,19 @@ std::unique_ptr remove_unused_keys(dictionary_column_view const& diction std::unique_ptr remove_keys(dictionary_column_view const& dictionary_column, column_view const& keys_to_remove, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::remove_keys(dictionary_column, keys_to_remove, cudf::get_default_stream(), mr); + return detail::remove_keys(dictionary_column, keys_to_remove, stream, mr); } std::unique_ptr remove_unused_keys(dictionary_column_view const& dictionary_column, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::remove_unused_keys(dictionary_column, cudf::get_default_stream(), mr); + return detail::remove_unused_keys(dictionary_column, stream, mr); } } // namespace dictionary diff --git a/cpp/src/dictionary/search.cu b/cpp/src/dictionary/search.cu index 8e97a387780..e35aded1984 100644 --- a/cpp/src/dictionary/search.cu +++ b/cpp/src/dictionary/search.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2022, NVIDIA CORPORATION. + * Copyright (c) 2020-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -79,10 +79,8 @@ struct find_index_fn { using ScalarType = cudf::scalar_type_t; auto find_key = static_cast(key).value(stream); auto keys_view = column_device_view::create(input.keys(), stream); - auto iter = thrust::equal_range(rmm::exec_policy(cudf::get_default_stream()), - keys_view->begin(), - keys_view->end(), - find_key); + auto iter = thrust::equal_range( + rmm::exec_policy(stream), keys_view->begin(), keys_view->end(), find_key); return type_dispatcher(input.indices().type(), dispatch_scalar_index{}, thrust::distance(keys_view->begin(), iter.first), @@ -176,10 +174,11 @@ std::unique_ptr get_insert_index(dictionary_column_view const& dictionar std::unique_ptr get_index(dictionary_column_view const& dictionary, scalar const& key, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::get_index(dictionary, key, cudf::get_default_stream(), mr); + return detail::get_index(dictionary, key, stream, mr); } } // namespace dictionary diff --git a/cpp/src/dictionary/set_keys.cu b/cpp/src/dictionary/set_keys.cu index 36f5021d305..b49cf7850b1 100644 --- a/cpp/src/dictionary/set_keys.cu +++ b/cpp/src/dictionary/set_keys.cu @@ -241,17 +241,20 @@ std::pair>, std::vector> match_d std::unique_ptr set_keys(dictionary_column_view const& dictionary_column, column_view const& keys, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::set_keys(dictionary_column, keys, cudf::get_default_stream(), mr); + return detail::set_keys(dictionary_column, keys, stream, mr); } std::vector> match_dictionaries( - cudf::host_span input, rmm::mr::device_memory_resource* mr) + cudf::host_span input, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::match_dictionaries(input, cudf::get_default_stream(), mr); + return detail::match_dictionaries(input, stream, mr); } } // namespace dictionary diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt index 956bfc7c27d..68ff6c54c99 100644 --- a/cpp/tests/CMakeLists.txt +++ b/cpp/tests/CMakeLists.txt @@ -629,6 +629,7 @@ ConfigureTest(STREAM_HASHING_TEST streams/hash_test.cpp STREAM_MODE testing) ConfigureTest(STREAM_INTEROP_TEST streams/interop_test.cpp STREAM_MODE testing) ConfigureTest(STREAM_REPLACE_TEST streams/replace_test.cpp STREAM_MODE testing) ConfigureTest(STREAM_SEARCH_TEST streams/search_test.cpp STREAM_MODE testing) +ConfigureTest(STREAM_DICTIONARY_TEST streams/dictionary_test.cpp STREAM_MODE testing) ConfigureTest( STREAM_STRINGS_TEST streams/strings/case_test.cpp streams/strings/find_test.cpp STREAM_MODE testing diff --git a/cpp/tests/streams/dictionary_test.cpp b/cpp/tests/streams/dictionary_test.cpp new file mode 100644 index 00000000000..f48e64c078e --- /dev/null +++ b/cpp/tests/streams/dictionary_test.cpp @@ -0,0 +1,105 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include +#include +#include +#include + +#include +#include +#include + +class DictionaryTest : public cudf::test::BaseFixture {}; + +TEST_F(DictionaryTest, Encode) +{ + cudf::test::fixed_width_column_wrapper col({1, 2, 3, 4, 5}); + cudf::data_type int32_type(cudf::type_id::UINT32); + cudf::column_view col_view = col; + cudf::dictionary::encode(col_view, int32_type, cudf::test::get_default_stream()); +} + +TEST_F(DictionaryTest, Decode) +{ + // keys = {0, 2, 6}, indices = {0, 1, 1, 2, 2} + std::vector elements{0, 2, 2, 6, 6}; + cudf::test::dictionary_column_wrapper dict_col(elements.begin(), elements.end()); + cudf::dictionary_column_view dict_col_view = dict_col; + cudf::dictionary::decode(dict_col_view, cudf::test::get_default_stream()); +} + +TEST_F(DictionaryTest, GetIndex) +{ + std::vector elements{0, 2, 2, 6, 6}; + cudf::test::dictionary_column_wrapper dict_col(elements.begin(), elements.end()); + cudf::dictionary_column_view dict_col_view = dict_col; + cudf::numeric_scalar key_scalar(2, true, cudf::test::get_default_stream()); + cudf::dictionary::get_index(dict_col_view, key_scalar, cudf::test::get_default_stream()); +} + +TEST_F(DictionaryTest, AddKeys) +{ + std::vector elements{0, 2, 2, 6, 6}; + cudf::test::dictionary_column_wrapper dict_col(elements.begin(), elements.end()); + cudf::dictionary_column_view dict_col_view = dict_col; + cudf::test::fixed_width_column_wrapper new_keys_col({8, 9}); + cudf::dictionary::add_keys(dict_col_view, new_keys_col, cudf::test::get_default_stream()); +} + +TEST_F(DictionaryTest, RemoveKeys) +{ + std::vector elements{0, 2, 2, 6, 6}; + cudf::test::dictionary_column_wrapper dict_col(elements.begin(), elements.end()); + cudf::dictionary_column_view dict_col_view = dict_col; + cudf::test::fixed_width_column_wrapper keys_to_remove_col({2}); + cudf::dictionary::remove_keys( + dict_col_view, keys_to_remove_col, cudf::test::get_default_stream()); +} + +TEST_F(DictionaryTest, RemoveUnsedKeys) +{ + std::vector elements{0, 2, 2, 6, 6}; + cudf::test::dictionary_column_wrapper dict_col(elements.begin(), elements.end()); + cudf::dictionary_column_view dict_col_view = dict_col; + cudf::dictionary::remove_unused_keys(dict_col_view, cudf::test::get_default_stream()); +} + +TEST_F(DictionaryTest, SetKeys) +{ + std::vector elements{0, 2, 2, 6, 6}; + cudf::test::dictionary_column_wrapper dict_col(elements.begin(), elements.end()); + cudf::dictionary_column_view dict_col_view = dict_col; + cudf::test::fixed_width_column_wrapper keys_col({2, 6}); + cudf::dictionary::set_keys(dict_col_view, keys_col, cudf::test::get_default_stream()); +} + +TEST_F(DictionaryTest, MatchDictionaries) +{ + std::vector elements_a{0, 2, 2, 6, 6}; + cudf::test::dictionary_column_wrapper dict_col_a(elements_a.begin(), elements_a.end()); + cudf::dictionary_column_view dict_col_view_a = dict_col_a; + + std::vector elements_b{1, 3, 4, 5, 5}; + cudf::test::dictionary_column_wrapper dict_col_b(elements_b.begin(), elements_b.end()); + cudf::dictionary_column_view dict_col_view_b = dict_col_b; + + std::vector dicts = {dict_col_view_a, dict_col_view_b}; + + cudf::test::fixed_width_column_wrapper keys_col({2, 6}); + cudf::dictionary::match_dictionaries(dicts, cudf::test::get_default_stream()); +} From 2e1a17d6519ea018921e35075306e01b4fdddf72 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Mon, 25 Sep 2023 15:53:55 -0700 Subject: [PATCH 213/230] Replace Python scalar conversions with libcudf (#14124) This PR replaces the various Cython converters for different libcudf scalar 
types by using the new libcudf `[to|from]_arrow` overloads for scalars introduced in #14121. This change dramatically simplifies the Cython code and paves the way for implementation of a pylibcudf.Scalar object. Authors: - Vyas Ramasubramani (https://github.com/vyasr) Approvers: - https://github.com/brandon-b-miller URL: https://github.com/rapidsai/cudf/pull/14124 --- python/cudf/cudf/_lib/cpp/interop.pxd | 11 +- python/cudf/cudf/_lib/interop.pyx | 95 +++++- python/cudf/cudf/_lib/scalar.pyx | 448 +++++--------------------- python/cudf/cudf/tests/test_list.py | 4 +- python/cudf/cudf/tests/test_struct.py | 35 +- python/cudf/cudf/utils/dtypes.py | 18 -- 6 files changed, 210 insertions(+), 401 deletions(-) diff --git a/python/cudf/cudf/_lib/cpp/interop.pxd b/python/cudf/cudf/_lib/cpp/interop.pxd index e81f0d617fb..88e9d83ee98 100644 --- a/python/cudf/cudf/_lib/cpp/interop.pxd +++ b/python/cudf/cudf/_lib/cpp/interop.pxd @@ -1,12 +1,13 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. +# Copyright (c) 2020-2023, NVIDIA CORPORATION. from libcpp.memory cimport shared_ptr, unique_ptr from libcpp.string cimport string from libcpp.vector cimport vector -from pyarrow.lib cimport CTable +from pyarrow.lib cimport CScalar, CTable from cudf._lib.types import cudf_to_np_types, np_to_cudf_types +from cudf._lib.cpp.scalar.scalar cimport scalar from cudf._lib.cpp.table.table cimport table from cudf._lib.cpp.table.table_view cimport table_view @@ -24,6 +25,7 @@ cdef extern from "cudf/interop.hpp" namespace "cudf" \ ) except + cdef unique_ptr[table] from_arrow(CTable input) except + + cdef unique_ptr[scalar] from_arrow(CScalar input) except + cdef cppclass column_metadata: column_metadata() except + @@ -35,3 +37,8 @@ cdef extern from "cudf/interop.hpp" namespace "cudf" \ table_view input, vector[column_metadata] metadata, ) except + + + cdef shared_ptr[CScalar] to_arrow( + const scalar& input, + column_metadata metadata, + ) except + diff --git a/python/cudf/cudf/_lib/interop.pyx b/python/cudf/cudf/_lib/interop.pyx index 8fd2a409d90..639754fc54f 100644 --- a/python/cudf/cudf/_lib/interop.pyx +++ b/python/cudf/cudf/_lib/interop.pyx @@ -4,7 +4,14 @@ from cpython cimport pycapsule from libcpp.memory cimport shared_ptr, unique_ptr from libcpp.utility cimport move from libcpp.vector cimport vector -from pyarrow.lib cimport CTable, pyarrow_unwrap_table, pyarrow_wrap_table +from pyarrow.lib cimport ( + CScalar, + CTable, + pyarrow_unwrap_scalar, + pyarrow_unwrap_table, + pyarrow_wrap_scalar, + pyarrow_wrap_table, +) from cudf._lib.cpp.interop cimport ( DLManagedTensor, @@ -14,12 +21,22 @@ from cudf._lib.cpp.interop cimport ( to_arrow as cpp_to_arrow, to_dlpack as cpp_to_dlpack, ) +from cudf._lib.cpp.scalar.scalar cimport fixed_point_scalar, scalar from cudf._lib.cpp.table.table cimport table from cudf._lib.cpp.table.table_view cimport table_view +from cudf._lib.cpp.types cimport type_id +from cudf._lib.cpp.wrappers.decimals cimport ( + decimal32, + decimal64, + decimal128, + scale_type, +) +from cudf._lib.scalar cimport DeviceScalar from cudf._lib.utils cimport columns_from_unique_ptr, table_view_from_columns from cudf.api.types import is_list_dtype, is_struct_dtype from cudf.core.buffer import acquire_spill_lock +from cudf.core.dtypes import Decimal32Dtype, Decimal64Dtype def from_dlpack(dlpack_capsule): @@ -182,3 +199,79 @@ def from_arrow(object input_table): c_result = move(cpp_from_arrow(cpp_arrow_table.get()[0])) return columns_from_unique_ptr(move(c_result)) + + +@acquire_spill_lock() +def 
to_arrow_scalar(DeviceScalar source_scalar): + """Convert a scalar to a PyArrow scalar. + + Parameters + ---------- + source_scalar : the scalar to convert + + Returns + ------- + pyarrow.lib.Scalar + """ + cdef vector[column_metadata] cpp_metadata = gather_metadata( + [("", source_scalar.dtype)] + ) + cdef const scalar* source_scalar_ptr = source_scalar.get_raw_ptr() + + cdef shared_ptr[CScalar] cpp_arrow_scalar + with nogil: + cpp_arrow_scalar = cpp_to_arrow( + source_scalar_ptr[0], cpp_metadata[0] + ) + + return pyarrow_wrap_scalar(cpp_arrow_scalar) + + +@acquire_spill_lock() +def from_arrow_scalar(object input_scalar, output_dtype=None): + """Convert from PyArrow scalar to a cudf scalar. + + Parameters + ---------- + input_scalar : PyArrow scalar + output_dtype : output type to cast to, ignored except for decimals + + Returns + ------- + cudf._lib.DeviceScalar + """ + cdef shared_ptr[CScalar] cpp_arrow_scalar = ( + pyarrow_unwrap_scalar(input_scalar) + ) + cdef unique_ptr[scalar] c_result + + with nogil: + c_result = move(cpp_from_arrow(cpp_arrow_scalar.get()[0])) + + cdef type_id ctype = c_result.get().type().id() + if ctype == type_id.DECIMAL128: + if output_dtype is None: + # Decimals must be cast to the cudf dtype of the right width + raise ValueError( + "Decimal scalars must be constructed with a dtype" + ) + + if isinstance(output_dtype, Decimal32Dtype): + c_result.reset( + new fixed_point_scalar[decimal32]( + ( c_result.get()).value(), + scale_type(-input_scalar.type.scale), + c_result.get().is_valid() + ) + ) + elif isinstance(output_dtype, Decimal64Dtype): + c_result.reset( + new fixed_point_scalar[decimal64]( + ( c_result.get()).value(), + scale_type(-input_scalar.type.scale), + c_result.get().is_valid() + ) + ) + # Decimal128Dtype is a no-op, no conversion needed. 
+ + return DeviceScalar.from_unique_ptr(move(c_result), output_dtype) diff --git a/python/cudf/cudf/_lib/scalar.pyx b/python/cudf/cudf/_lib/scalar.pyx index 0407785b2d8..5ab286c5701 100644 --- a/python/cudf/cudf/_lib/scalar.pyx +++ b/python/cudf/cudf/_lib/scalar.pyx @@ -2,22 +2,13 @@ cimport cython -import decimal +import copy import numpy as np import pandas as pd import pyarrow as pa -from libc.stdint cimport ( - int8_t, - int16_t, - int32_t, - int64_t, - uint8_t, - uint16_t, - uint32_t, - uint64_t, -) +from libc.stdint cimport int64_t from libcpp cimport bool from libcpp.memory cimport unique_ptr from libcpp.utility cimport move @@ -25,38 +16,22 @@ from libcpp.utility cimport move from rmm._lib.memory_resource cimport get_current_device_resource import cudf -from cudf._lib.types import ( - LIBCUDF_TO_SUPPORTED_NUMPY_TYPES, - datetime_unit_map, - duration_unit_map, -) +from cudf._lib.types import LIBCUDF_TO_SUPPORTED_NUMPY_TYPES from cudf.core.dtypes import ListDtype, StructDtype from cudf.core.missing import NA, NaT -from cudf._lib.column cimport Column -from cudf._lib.cpp.column.column_view cimport column_view -from cudf._lib.cpp.table.table_view cimport table_view from cudf._lib.types cimport dtype_from_column_view, underlying_type_t_type_id -from cudf._lib.interop import from_arrow, to_arrow +from cudf._lib.interop import from_arrow_scalar, to_arrow_scalar cimport cudf._lib.cpp.types as libcudf_types from cudf._lib.cpp.scalar.scalar cimport ( duration_scalar, - fixed_point_scalar, list_scalar, - numeric_scalar, scalar, - string_scalar, struct_scalar, timestamp_scalar, ) -from cudf._lib.cpp.wrappers.decimals cimport ( - decimal32, - decimal64, - decimal128, - scale_type, -) from cudf._lib.cpp.wrappers.durations cimport ( duration_ms, duration_ns, @@ -69,7 +44,21 @@ from cudf._lib.cpp.wrappers.timestamps cimport ( timestamp_s, timestamp_us, ) -from cudf._lib.utils cimport columns_from_table_view, table_view_from_columns + + +def _replace_nested(obj, check, replacement): + if isinstance(obj, list): + for i, item in enumerate(obj): + if check(item): + obj[i] = replacement + elif isinstance(item, (dict, list)): + _replace_nested(item, check, replacement) + elif isinstance(obj, dict): + for k, v in obj.items(): + if check(v): + obj[k] = replacement + elif isinstance(v, (dict, list)): + _replace_nested(v, check, replacement) # The DeviceMemoryResource attribute could be released prematurely @@ -97,61 +86,61 @@ cdef class DeviceScalar: A NumPy dtype. """ self._dtype = dtype if dtype.kind != 'U' else cudf.dtype('object') - self._set_value(value, self._dtype) - - def _set_value(self, value, dtype): - # IMPORTANT: this should only ever be called from __init__ - valid = not _is_null_host_scalar(value) - - if isinstance(dtype, cudf.core.dtypes.DecimalDtype): - _set_decimal_from_scalar( - self.c_value, value, dtype, valid) - elif isinstance(dtype, cudf.ListDtype): - _set_list_from_pylist( - self.c_value, value, dtype, valid) - elif isinstance(dtype, cudf.StructDtype): - _set_struct_from_pydict(self.c_value, value, dtype, valid) + + if cudf.utils.utils.is_na_like(value): + value = None + else: + # TODO: For now we always deepcopy the input value to avoid + # overwriting the input values when replacing nulls. Since it's + # just host values it's not that expensive, but we could consider + # alternatives. 
+ value = copy.deepcopy(value) + _replace_nested(value, cudf.utils.utils.is_na_like, None) + + if isinstance(dtype, cudf.core.dtypes._BaseDtype): + pa_type = dtype.to_arrow() elif pd.api.types.is_string_dtype(dtype): - _set_string_from_np_string(self.c_value, value, valid) - elif pd.api.types.is_numeric_dtype(dtype): - _set_numeric_from_np_scalar(self.c_value, - value, - dtype, - valid) - elif pd.api.types.is_datetime64_dtype(dtype): - _set_datetime64_from_np_scalar( - self.c_value, value, dtype, valid - ) - elif pd.api.types.is_timedelta64_dtype(dtype): - _set_timedelta64_from_np_scalar( - self.c_value, value, dtype, valid - ) + # Have to manually convert object types, which we use internally + # for strings but pyarrow only supports as unicode 'U' + pa_type = pa.string() else: - raise ValueError( - f"Cannot convert value of type " - f"{type(value).__name__} to cudf scalar" - ) + pa_type = pa.from_numpy_dtype(dtype) + + pa_scalar = pa.scalar(value, type=pa_type) + + # Note: This factory-like behavior in __init__ will be removed when + # migrating to pylibcudf. + cdef DeviceScalar obj = from_arrow_scalar(pa_scalar, self._dtype) + self.c_value.swap(obj.c_value) def _to_host_scalar(self): - if isinstance(self.dtype, cudf.core.dtypes.DecimalDtype): - result = _get_py_decimal_from_fixed_point(self.c_value) - elif cudf.api.types.is_struct_dtype(self.dtype): - result = _get_py_dict_from_struct(self.c_value, self.dtype) - elif cudf.api.types.is_list_dtype(self.dtype): - result = _get_py_list_from_list(self.c_value, self.dtype) - elif pd.api.types.is_string_dtype(self.dtype): - result = _get_py_string_from_string(self.c_value) - elif pd.api.types.is_numeric_dtype(self.dtype): - result = _get_np_scalar_from_numeric(self.c_value) - elif pd.api.types.is_datetime64_dtype(self.dtype): - result = _get_np_scalar_from_timestamp64(self.c_value) - elif pd.api.types.is_timedelta64_dtype(self.dtype): - result = _get_np_scalar_from_timedelta64(self.c_value) + is_datetime = self.dtype.kind == "M" + is_timedelta = self.dtype.kind == "m" + + null_type = NaT if is_datetime or is_timedelta else NA + + ps = to_arrow_scalar(self) + if not ps.is_valid: + return null_type + + # TODO: The special handling of specific types below does not currently + # extend to nested types containing those types (e.g. List[timedelta] + # where the timedelta would overflow). We should eventually account for + # those cases, but that will require more careful consideration of how + # to traverse the contents of the nested data. 
+ if is_datetime or is_timedelta: + time_unit, _ = np.datetime_data(self.dtype) + # Cast to int64 to avoid overflow + ps_cast = ps.cast('int64').as_py() + out_type = np.datetime64 if is_datetime else np.timedelta64 + ret = out_type(ps_cast, time_unit) + elif cudf.api.types.is_numeric_dtype(self.dtype): + ret = ps.type.to_pandas_dtype()(ps.as_py()) else: - raise ValueError( - "Could not convert cudf::scalar to a Python value" - ) - return result + ret = ps.as_py() + + _replace_nested(ret, lambda item: item is None, NA) + return ret @property def dtype(self): @@ -236,42 +225,9 @@ cdef class DeviceScalar: return s -cdef _set_string_from_np_string(unique_ptr[scalar]& s, value, bool valid=True): - value = value if valid else "" - s.reset(new string_scalar(value.encode(), valid)) - - -cdef _set_numeric_from_np_scalar(unique_ptr[scalar]& s, - object value, - object dtype, - bool valid=True): - value = value if valid else 0 - if dtype == "int8": - s.reset(new numeric_scalar[int8_t](value, valid)) - elif dtype == "int16": - s.reset(new numeric_scalar[int16_t](value, valid)) - elif dtype == "int32": - s.reset(new numeric_scalar[int32_t](value, valid)) - elif dtype == "int64": - s.reset(new numeric_scalar[int64_t](value, valid)) - elif dtype == "uint8": - s.reset(new numeric_scalar[uint8_t](value, valid)) - elif dtype == "uint16": - s.reset(new numeric_scalar[uint16_t](value, valid)) - elif dtype == "uint32": - s.reset(new numeric_scalar[uint32_t](value, valid)) - elif dtype == "uint64": - s.reset(new numeric_scalar[uint64_t](value, valid)) - elif dtype == "float32": - s.reset(new numeric_scalar[float](value, valid)) - elif dtype == "float64": - s.reset(new numeric_scalar[double](value, valid)) - elif dtype == "bool": - s.reset(new numeric_scalar[bool](value, valid)) - else: - raise ValueError(f"dtype not supported: {dtype}") - - +# TODO: Currently the only uses of this function and the one below are in +# _create_proxy_nat_scalar. See if that code path can be simplified to excise +# or at least simplify these implementations. 
cdef _set_datetime64_from_np_scalar(unique_ptr[scalar]& s, object value, object dtype, @@ -324,253 +280,6 @@ cdef _set_timedelta64_from_np_scalar(unique_ptr[scalar]& s, else: raise ValueError(f"dtype not supported: {dtype}") -cdef _set_decimal_from_scalar(unique_ptr[scalar]& s, - object value, - object dtype, - bool valid=True): - value = cudf.utils.dtypes._decimal_to_int64(value) if valid else 0 - if isinstance(dtype, cudf.Decimal64Dtype): - s.reset( - new fixed_point_scalar[decimal64]( - np.int64(value), scale_type(-dtype.scale), valid - ) - ) - elif isinstance(dtype, cudf.Decimal32Dtype): - s.reset( - new fixed_point_scalar[decimal32]( - np.int32(value), scale_type(-dtype.scale), valid - ) - ) - elif isinstance(dtype, cudf.Decimal128Dtype): - s.reset( - new fixed_point_scalar[decimal128]( - value, scale_type(-dtype.scale), valid - ) - ) - else: - raise ValueError(f"dtype not supported: {dtype}") - -cdef _set_struct_from_pydict(unique_ptr[scalar]& s, - object value, - object dtype, - bool valid=True): - arrow_schema = dtype.to_arrow() - columns = [str(i) for i in range(len(arrow_schema))] - if valid: - pyarrow_table = pa.Table.from_arrays( - [ - pa.array([value[f.name]], from_pandas=True, type=f.type) - for f in arrow_schema - ], - names=columns - ) - else: - pyarrow_table = pa.Table.from_arrays( - [ - pa.array([NA], from_pandas=True, type=f.type) - for f in arrow_schema - ], - names=columns - ) - - data = from_arrow(pyarrow_table) - cdef table_view struct_view = table_view_from_columns(data) - - s.reset( - new struct_scalar(struct_view, valid) - ) - -cdef _get_py_dict_from_struct(unique_ptr[scalar]& s, dtype): - if not s.get()[0].is_valid(): - return NA - - cdef table_view struct_table_view = (s.get()).view() - columns = columns_from_table_view(struct_table_view, None) - struct_col = cudf.core.column.build_struct_column( - names=dtype.fields.keys(), - children=tuple(columns), - size=1, - ) - table = to_arrow([struct_col], [("None", dtype)]) - python_dict = table.to_pydict()["None"][0] - return {k: _nested_na_replace([python_dict[k]])[0] for k in python_dict} - -cdef _set_list_from_pylist(unique_ptr[scalar]& s, - object value, - object dtype, - bool valid=True): - - value = value if valid else [NA] - cdef Column col - if isinstance(dtype.element_type, ListDtype): - pa_type = dtype.element_type.to_arrow() - else: - pa_type = dtype.to_arrow().value_type - col = cudf.core.column.as_column( - pa.array(value, from_pandas=True, type=pa_type) - ) - cdef column_view col_view = col.view() - s.reset( - new list_scalar(col_view, valid) - ) - - -cdef _get_py_list_from_list(unique_ptr[scalar]& s, dtype): - - if not s.get()[0].is_valid(): - return NA - - cdef column_view list_col_view = (s.get()).view() - cdef Column element_col = Column.from_column_view(list_col_view, None) - - arrow_obj = to_arrow([element_col], [("None", dtype.element_type)])["None"] - - result = arrow_obj.to_pylist() - return _nested_na_replace(result) - - -cdef _get_py_string_from_string(unique_ptr[scalar]& s): - if not s.get()[0].is_valid(): - return NA - return (s.get())[0].to_string().decode() - - -cdef _get_np_scalar_from_numeric(unique_ptr[scalar]& s): - cdef scalar* s_ptr = s.get() - if not s_ptr[0].is_valid(): - return NA - - cdef libcudf_types.data_type cdtype = s_ptr[0].type() - - if cdtype.id() == libcudf_types.type_id.INT8: - return np.int8((s_ptr)[0].value()) - elif cdtype.id() == libcudf_types.type_id.INT16: - return np.int16((s_ptr)[0].value()) - elif cdtype.id() == libcudf_types.type_id.INT32: - return 
np.int32((s_ptr)[0].value()) - elif cdtype.id() == libcudf_types.type_id.INT64: - return np.int64((s_ptr)[0].value()) - elif cdtype.id() == libcudf_types.type_id.UINT8: - return np.uint8((s_ptr)[0].value()) - elif cdtype.id() == libcudf_types.type_id.UINT16: - return np.uint16((s_ptr)[0].value()) - elif cdtype.id() == libcudf_types.type_id.UINT32: - return np.uint32((s_ptr)[0].value()) - elif cdtype.id() == libcudf_types.type_id.UINT64: - return np.uint64((s_ptr)[0].value()) - elif cdtype.id() == libcudf_types.type_id.FLOAT32: - return np.float32((s_ptr)[0].value()) - elif cdtype.id() == libcudf_types.type_id.FLOAT64: - return np.float64((s_ptr)[0].value()) - elif cdtype.id() == libcudf_types.type_id.BOOL8: - return np.bool_((s_ptr)[0].value()) - else: - raise ValueError("Could not convert cudf::scalar to numpy scalar") - - -cdef _get_py_decimal_from_fixed_point(unique_ptr[scalar]& s): - cdef scalar* s_ptr = s.get() - if not s_ptr[0].is_valid(): - return NA - - cdef libcudf_types.data_type cdtype = s_ptr[0].type() - - if cdtype.id() == libcudf_types.type_id.DECIMAL64: - rep_val = int((s_ptr)[0].value()) - scale = int((s_ptr)[0].type().scale()) - return decimal.Decimal(rep_val).scaleb(scale) - elif cdtype.id() == libcudf_types.type_id.DECIMAL32: - rep_val = int((s_ptr)[0].value()) - scale = int((s_ptr)[0].type().scale()) - return decimal.Decimal(rep_val).scaleb(scale) - elif cdtype.id() == libcudf_types.type_id.DECIMAL128: - rep_val = int((s_ptr)[0].value()) - scale = int((s_ptr)[0].type().scale()) - return decimal.Decimal(rep_val).scaleb(scale) - else: - raise ValueError("Could not convert cudf::scalar to numpy scalar") - -cdef _get_np_scalar_from_timestamp64(unique_ptr[scalar]& s): - - cdef scalar* s_ptr = s.get() - - if not s_ptr[0].is_valid(): - return NaT - - cdef libcudf_types.data_type cdtype = s_ptr[0].type() - - if cdtype.id() == libcudf_types.type_id.TIMESTAMP_SECONDS: - return np.datetime64( - ( - s_ptr - )[0].ticks_since_epoch_64(), - datetime_unit_map[(cdtype.id())] - ) - elif cdtype.id() == libcudf_types.type_id.TIMESTAMP_MILLISECONDS: - return np.datetime64( - ( - s_ptr - )[0].ticks_since_epoch_64(), - datetime_unit_map[(cdtype.id())] - ) - elif cdtype.id() == libcudf_types.type_id.TIMESTAMP_MICROSECONDS: - return np.datetime64( - ( - s_ptr - )[0].ticks_since_epoch_64(), - datetime_unit_map[(cdtype.id())] - ) - elif cdtype.id() == libcudf_types.type_id.TIMESTAMP_NANOSECONDS: - return np.datetime64( - ( - s_ptr - )[0].ticks_since_epoch_64(), - datetime_unit_map[(cdtype.id())] - ) - else: - raise ValueError("Could not convert cudf::scalar to numpy scalar") - - -cdef _get_np_scalar_from_timedelta64(unique_ptr[scalar]& s): - - cdef scalar* s_ptr = s.get() - - if not s_ptr[0].is_valid(): - return NaT - - cdef libcudf_types.data_type cdtype = s_ptr[0].type() - - if cdtype.id() == libcudf_types.type_id.DURATION_SECONDS: - return np.timedelta64( - ( - s_ptr - )[0].ticks(), - duration_unit_map[(cdtype.id())] - ) - elif cdtype.id() == libcudf_types.type_id.DURATION_MILLISECONDS: - return np.timedelta64( - ( - s_ptr - )[0].ticks(), - duration_unit_map[(cdtype.id())] - ) - elif cdtype.id() == libcudf_types.type_id.DURATION_MICROSECONDS: - return np.timedelta64( - ( - s_ptr - )[0].ticks(), - duration_unit_map[(cdtype.id())] - ) - elif cdtype.id() == libcudf_types.type_id.DURATION_NANOSECONDS: - return np.timedelta64( - ( - s_ptr - )[0].ticks(), - duration_unit_map[(cdtype.id())] - ) - else: - raise ValueError("Could not convert cudf::scalar to numpy scalar") - def as_device_scalar(val, 
dtype=None): if isinstance(val, (cudf.Scalar, DeviceScalar)): @@ -607,16 +316,3 @@ def _create_proxy_nat_scalar(dtype): return result else: raise TypeError('NAT only valid for datetime and timedelta') - - -def _nested_na_replace(input_list): - ''' - Replace `None` with `cudf.NA` in the result of - `__getitem__` calls to list type columns - ''' - for idx, value in enumerate(input_list): - if isinstance(value, list): - _nested_na_replace(value) - elif value is None: - input_list[idx] = NA - return input_list diff --git a/python/cudf/cudf/tests/test_list.py b/python/cudf/cudf/tests/test_list.py index 5dd58d8a875..ac10dd97c56 100644 --- a/python/cudf/cudf/tests/test_list.py +++ b/python/cudf/cudf/tests/test_list.py @@ -895,14 +895,14 @@ def test_memory_usage(): "data, idx", [ ( - [[{"f2": {"a": 100}, "f1": "a"}, {"f1": "sf12", "f2": None}]], + [[{"f2": {"a": 100}, "f1": "a"}, {"f1": "sf12", "f2": NA}]], 0, ), ( [ [ {"f2": {"a": 100, "c": 90, "f2": 10}, "f1": "a"}, - {"f1": "sf12", "f2": None}, + {"f1": "sf12", "f2": NA}, ] ], 0, diff --git a/python/cudf/cudf/tests/test_struct.py b/python/cudf/cudf/tests/test_struct.py index a3593e55b97..ce6dc587320 100644 --- a/python/cudf/cudf/tests/test_struct.py +++ b/python/cudf/cudf/tests/test_struct.py @@ -150,9 +150,7 @@ def test_struct_setitem(data, item): "data", [ {"a": 1, "b": "rapids", "c": [1, 2, 3, 4]}, - {"a": 1, "b": "rapids", "c": [1, 2, 3, 4], "d": cudf.NA}, {"a": "Hello"}, - {"b": [], "c": [1, 2, 3]}, ], ) def test_struct_scalar_host_construction(data): @@ -161,6 +159,39 @@ def test_struct_scalar_host_construction(data): assert list(slr.device_value.value.values()) == list(data.values()) +@pytest.mark.parametrize( + ("data", "dtype"), + [ + ( + {"a": 1, "b": "rapids", "c": [1, 2, 3, 4], "d": cudf.NA}, + cudf.StructDtype( + { + "a": np.dtype(np.int64), + "b": np.dtype(np.str_), + "c": cudf.ListDtype(np.dtype(np.int64)), + "d": np.dtype(np.int64), + } + ), + ), + ( + {"b": [], "c": [1, 2, 3]}, + cudf.StructDtype( + { + "b": cudf.ListDtype(np.dtype(np.int64)), + "c": cudf.ListDtype(np.dtype(np.int64)), + } + ), + ), + ], +) +def test_struct_scalar_host_construction_no_dtype_inference(data, dtype): + # cudf cannot infer the dtype of the scalar when it contains only nulls or + # is empty. + slr = cudf.Scalar(data, dtype=dtype) + assert slr.value == data + assert list(slr.device_value.value.values()) == list(data.values()) + + def test_struct_scalar_null(): slr = cudf.Scalar(cudf.NA, dtype=StructDtype) assert slr.device_value.value is cudf.NA diff --git a/python/cudf/cudf/utils/dtypes.py b/python/cudf/cudf/utils/dtypes.py index 1b94db75340..73ea8e2cfc4 100644 --- a/python/cudf/cudf/utils/dtypes.py +++ b/python/cudf/cudf/utils/dtypes.py @@ -463,24 +463,6 @@ def _get_nan_for_dtype(dtype): return np.float64("nan") -def _decimal_to_int64(decimal: Decimal) -> int: - """ - Scale a Decimal such that the result is the integer - that would result from removing the decimal point. 
- - Examples - -------- - >>> _decimal_to_int64(Decimal('1.42')) - 142 - >>> _decimal_to_int64(Decimal('0.0042')) - 42 - >>> _decimal_to_int64(Decimal('-1.004201')) - -1004201 - - """ - return int(f"{decimal:0f}".replace(".", "")) - - def get_allowed_combinations_for_operator(dtype_l, dtype_r, op): error = TypeError( f"{op} not supported between {dtype_l} and {dtype_r} scalars" From daea8c8bc37ec53b7347857a3b6795bcb0ad86ff Mon Sep 17 00:00:00 2001 From: AJ Schmidt Date: Tue, 26 Sep 2023 09:11:31 -0400 Subject: [PATCH 214/230] Disable `Recently Updated` Check (#14193) This check occasionally hangs for `cudf` for unknown reasons. Upon checking the application logs, the GitHub API seems to be returning responses that aren't helpful in troubleshooting the problem. Therefore, it's probably best to just remove the check to avoid confusion. [skip ci] Authors: - AJ Schmidt (https://github.com/ajschmidt8) --- .github/ops-bot.yaml | 1 - 1 file changed, 1 deletion(-) diff --git a/.github/ops-bot.yaml b/.github/ops-bot.yaml index 9a0b4155035..d2ca78924e1 100644 --- a/.github/ops-bot.yaml +++ b/.github/ops-bot.yaml @@ -5,4 +5,3 @@ auto_merger: true branch_checker: true label_checker: true release_drafter: true -recently_updated: true From 3196f6c36140962818aa8d12fe4fbd0dc522e31e Mon Sep 17 00:00:00 2001 From: Jake Awe <50372925+AyodeAwe@users.noreply.github.com> Date: Tue, 26 Sep 2023 11:54:18 -0500 Subject: [PATCH 215/230] update rmm tag path (#14195) PR updates the download path of the `rmm` tag used in `build_docs.sh` following the re-arrangement of the docs directories. Authors: - Jake Awe (https://github.com/AyodeAwe) Approvers: - AJ Schmidt (https://github.com/ajschmidt8) URL: https://github.com/rapidsai/cudf/pull/14195 --- ci/build_docs.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ci/build_docs.sh b/ci/build_docs.sh index 1ed047a500b..9149b5e6bfe 100755 --- a/ci/build_docs.sh +++ b/ci/build_docs.sh @@ -30,7 +30,7 @@ export RAPIDS_DOCS_DIR="$(mktemp -d)" rapids-logger "Build CPP docs" pushd cpp/doxygen -aws s3 cp s3://rapidsai-docs/librmm/${RAPIDS_VERSION_NUMBER}/html/rmm.tag . || echo "Failed to download rmm Doxygen tag" +aws s3 cp s3://rapidsai-docs/librmm/html/${RAPIDS_VERSION_NUMBER}/rmm.tag . || echo "Failed to download rmm Doxygen tag" doxygen Doxyfile mkdir -p "${RAPIDS_DOCS_DIR}/libcudf/html" mv html/* "${RAPIDS_DOCS_DIR}/libcudf/html" From a9ec350217331979359c50ea1da9457e9973f719 Mon Sep 17 00:00:00 2001 From: GALI PREM SAGAR Date: Tue, 26 Sep 2023 14:32:04 -0500 Subject: [PATCH 216/230] Fix pytorch related pytest (#14198) Calling `cudf.Index([])` results in `str` dtype `Index`. This PR fixes an issue with a pytorch related pytest by explicitly passing a `float64` dtype. xref: https://github.com/rapidsai/cudf/pull/14116 Authors: - GALI PREM SAGAR (https://github.com/galipremsagar) Approvers: - https://github.com/brandon-b-miller URL: https://github.com/rapidsai/cudf/pull/14198 --- python/cudf/cudf/tests/test_cuda_array_interface.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python/cudf/cudf/tests/test_cuda_array_interface.py b/python/cudf/cudf/tests/test_cuda_array_interface.py index e81f4ec795a..848c77206b2 100644 --- a/python/cudf/cudf/tests/test_cuda_array_interface.py +++ b/python/cudf/cudf/tests/test_cuda_array_interface.py @@ -1,4 +1,4 @@ -# Copyright (c) 2019-2022, NVIDIA CORPORATION. +# Copyright (c) 2019-2023, NVIDIA CORPORATION. 
import types from contextlib import ExitStack as does_not_raise @@ -193,7 +193,7 @@ def test_cuda_array_interface_pytorch(): assert_eq(got, cudf.Series(buffer, dtype=np.bool_)) - index = cudf.Index([]) + index = cudf.Index([], dtype="float64") tensor = torch.tensor(index) got = cudf.Index(tensor) assert_eq(got, index) From 030c0f4995ec458fcfc00a4ebb3aa8bccb2b27a0 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Tue, 26 Sep 2023 12:42:12 -0700 Subject: [PATCH 217/230] Refactor `contains_table` with cuco::static_set (#14064) Contributes to #12261 This PR refactors `contains_table` to use the new `cuco::static_set` data structure. It also adds a `contains_table` benchmark to track the performance before and after this work. Authors: - Yunsong Wang (https://github.com/PointKernel) Approvers: - Nghia Truong (https://github.com/ttnghia) - Bradley Dice (https://github.com/bdice) URL: https://github.com/rapidsai/cudf/pull/14064 --- cpp/benchmarks/CMakeLists.txt | 2 +- .../{contains.cpp => contains_scalar.cpp} | 0 cpp/benchmarks/search/contains_table.cpp | 73 ++++ cpp/include/cudf/detail/search.hpp | 2 + cpp/src/search/contains_table.cu | 319 +++++++++--------- 5 files changed, 229 insertions(+), 167 deletions(-) rename cpp/benchmarks/search/{contains.cpp => contains_scalar.cpp} (100%) create mode 100644 cpp/benchmarks/search/contains_table.cpp diff --git a/cpp/benchmarks/CMakeLists.txt b/cpp/benchmarks/CMakeLists.txt index 5e7862f4b3b..cd6b3cfdc03 100644 --- a/cpp/benchmarks/CMakeLists.txt +++ b/cpp/benchmarks/CMakeLists.txt @@ -173,7 +173,7 @@ ConfigureBench(ITERATOR_BENCH iterator/iterator.cu) # ################################################################################################## # * search benchmark ------------------------------------------------------------------------------ ConfigureBench(SEARCH_BENCH search/search.cpp) -ConfigureNVBench(SEARCH_NVBENCH search/contains.cpp) +ConfigureNVBench(SEARCH_NVBENCH search/contains_scalar.cpp search/contains_table.cpp) # ################################################################################################## # * sort benchmark -------------------------------------------------------------------------------- diff --git a/cpp/benchmarks/search/contains.cpp b/cpp/benchmarks/search/contains_scalar.cpp similarity index 100% rename from cpp/benchmarks/search/contains.cpp rename to cpp/benchmarks/search/contains_scalar.cpp diff --git a/cpp/benchmarks/search/contains_table.cpp b/cpp/benchmarks/search/contains_table.cpp new file mode 100644 index 00000000000..17702d0741c --- /dev/null +++ b/cpp/benchmarks/search/contains_table.cpp @@ -0,0 +1,73 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include + +#include +#include +#include + +#include + +#include + +auto constexpr num_unique_elements = 1000; + +template +static void nvbench_contains_table(nvbench::state& state, nvbench::type_list) +{ + auto const size = state.get_int64("table_size"); + auto const dtype = cudf::type_to_id(); + double const null_probability = state.get_float64("null_probability"); + + auto builder = data_profile_builder().null_probability(null_probability); + if (dtype == cudf::type_id::LIST) { + builder.distribution(dtype, distribution_id::UNIFORM, 0, num_unique_elements) + .distribution(cudf::type_id::INT32, distribution_id::UNIFORM, 0, num_unique_elements) + .list_depth(1); + } else { + builder.distribution(dtype, distribution_id::UNIFORM, 0, num_unique_elements); + } + + auto const haystack = create_random_table( + {dtype}, table_size_bytes{static_cast(size)}, data_profile{builder}, 0); + auto const needles = create_random_table( + {dtype}, table_size_bytes{static_cast(size)}, data_profile{builder}, 1); + + auto mem_stats_logger = cudf::memory_stats_logger(); + + state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { + auto const stream_view = rmm::cuda_stream_view{launch.get_stream()}; + [[maybe_unused]] auto const result = + cudf::detail::contains(haystack->view(), + needles->view(), + cudf::null_equality::EQUAL, + cudf::nan_equality::ALL_EQUAL, + stream_view, + rmm::mr::get_current_device_resource()); + }); + + state.add_buffer_size( + mem_stats_logger.peak_memory_usage(), "peak_memory_usage", "peak_memory_usage"); +} + +NVBENCH_BENCH_TYPES(nvbench_contains_table, + NVBENCH_TYPE_AXES(nvbench::type_list)) + .set_name("contains_table") + .set_type_axes_names({"type"}) + .add_float64_axis("null_probability", {0.0, 0.1}) + .add_int64_axis("table_size", {10'000, 100'000, 1'000'000, 10'000'000}); diff --git a/cpp/include/cudf/detail/search.hpp b/cpp/include/cudf/detail/search.hpp index 4c4ad7834f4..4277baf3edd 100644 --- a/cpp/include/cudf/detail/search.hpp +++ b/cpp/include/cudf/detail/search.hpp @@ -81,6 +81,8 @@ std::unique_ptr contains(column_view const& haystack, * output = { false, true, true } * @endcode * + * @throws cudf::logic_error If column types of haystack and needles don't match + * * @param haystack The table containing the search space * @param needles A table of rows whose existence to check in the search space * @param compare_nulls Control whether nulls should be compared as equal or not diff --git a/cpp/src/search/contains_table.cu b/cpp/src/search/contains_table.cu index e37f0686ac3..43624ba691d 100644 --- a/cpp/src/search/contains_table.cu +++ b/cpp/src/search/contains_table.cu @@ -26,7 +26,7 @@ #include -#include +#include #include @@ -37,69 +37,59 @@ namespace { using cudf::experimental::row::lhs_index_type; using cudf::experimental::row::rhs_index_type; -using static_map = cuco::static_map>>; - /** - * @brief Check if the given type `T` is a strong index type (i.e., `lhs_index_type` or - * `rhs_index_type`). 
- * - * @return A boolean value indicating if `T` is a strong index type + * @brief An hasher adapter wrapping both haystack hasher and needles hasher */ -template -constexpr auto is_strong_index_type() -{ - return std::is_same_v || std::is_same_v; -} +template +struct hasher_adapter { + hasher_adapter(HaystackHasher const& haystack_hasher, NeedleHasher const& needle_hasher) + : _haystack_hasher{haystack_hasher}, _needle_hasher{needle_hasher} + { + } -/** - * @brief An adapter functor to support strong index types for row hasher that must be operating on - * `cudf::size_type`. - */ -template -struct strong_index_hasher_adapter { - strong_index_hasher_adapter(Hasher const& hasher) : _hasher{hasher} {} + __device__ constexpr auto operator()(lhs_index_type idx) const noexcept + { + return _haystack_hasher(static_cast(idx)); + } - template ())> - __device__ constexpr auto operator()(T const idx) const noexcept + __device__ constexpr auto operator()(rhs_index_type idx) const noexcept { - return _hasher(static_cast(idx)); + return _needle_hasher(static_cast(idx)); } private: - Hasher const _hasher; + HaystackHasher const _haystack_hasher; + NeedleHasher const _needle_hasher; }; /** - * @brief An adapter functor to support strong index type for table row comparator that must be - * operating on `cudf::size_type`. + * @brief An comparator adapter wrapping both self comparator and two table comparator */ -template -struct strong_index_comparator_adapter { - strong_index_comparator_adapter(Comparator const& comparator) : _comparator{comparator} {} - - template () && is_strong_index_type())> - __device__ constexpr auto operator()(T const lhs_index, U const rhs_index) const noexcept +template +struct comparator_adapter { + comparator_adapter(SelfEqual const& self_equal, TwoTableEqual const& two_table_equal) + : _self_equal{self_equal}, _two_table_equal{two_table_equal} + { + } + + __device__ constexpr auto operator()(lhs_index_type lhs_index, + lhs_index_type rhs_index) const noexcept { auto const lhs = static_cast(lhs_index); auto const rhs = static_cast(rhs_index); - if constexpr (std::is_same_v || std::is_same_v) { - return _comparator(lhs, rhs); - } else { - // Here we have T == rhs_index_type. - // This is when the indices are provided in wrong order for two table comparator, so we need - // to switch them back to the right order before calling the underlying comparator. - return _comparator(rhs, lhs); - } + return _self_equal(lhs, rhs); + } + + __device__ constexpr auto operator()(lhs_index_type lhs_index, + rhs_index_type rhs_index) const noexcept + { + return _two_table_equal(lhs_index, rhs_index); } private: - Comparator const _comparator; + SelfEqual const _self_equal; + TwoTableEqual const _two_table_equal; }; /** @@ -134,38 +124,62 @@ std::pair build_row_bitmask(table_view } /** - * @brief Invoke an `operator()` template with a row equality comparator based on the specified - * `compare_nans` parameter. 
+ * @brief Invokes the given `func` with desired comparators based on the specified `compare_nans` + * parameter + * + * @tparam HasNested Flag indicating whether there are nested columns in haystack or needles + * @tparam Hasher Type of device hash function + * @tparam Func Type of the helper function doing `contains` check * - * @param compare_nans The flag to specify whether NaNs should be compared equal or not + * @param compare_nulls Control whether nulls should be compared as equal or not + * @param compare_nans Control whether floating-point NaNs values should be compared as equal or not + * @param haystack_has_nulls Flag indicating whether haystack has nulls or not + * @param has_any_nulls Flag indicating whether there are nested nulls is either haystack or needles + * @param self_equal Self table comparator + * @param two_table_equal Two table comparator + * @param d_hasher Device hash functor * @param func The input functor to invoke */ -template -void dispatch_nan_comparator(nan_equality compare_nans, Func&& func) +template +void dispatch_nan_comparator( + null_equality compare_nulls, + nan_equality compare_nans, + bool haystack_has_nulls, + bool has_any_nulls, + cudf::experimental::row::equality::self_comparator self_equal, + cudf::experimental::row::equality::two_table_comparator two_table_equal, + Hasher const& d_hasher, + Func&& func) { + // Distinguish probing scheme CG sizes between nested and flat types for better performance + auto const probing_scheme = [&]() { + if constexpr (HasNested) { + return cuco::experimental::linear_probing<4, Hasher>{d_hasher}; + } else { + return cuco::experimental::linear_probing<1, Hasher>{d_hasher}; + } + }(); + if (compare_nans == nan_equality::ALL_EQUAL) { using nan_equal_comparator = cudf::experimental::row::equality::nan_equal_physical_equality_comparator; - func(nan_equal_comparator{}); + auto const d_self_equal = self_equal.equal_to( + nullate::DYNAMIC{haystack_has_nulls}, compare_nulls, nan_equal_comparator{}); + auto const d_two_table_equal = two_table_equal.equal_to( + nullate::DYNAMIC{has_any_nulls}, compare_nulls, nan_equal_comparator{}); + func(d_self_equal, d_two_table_equal, probing_scheme); } else { using nan_unequal_comparator = cudf::experimental::row::equality::physical_equality_comparator; - func(nan_unequal_comparator{}); + auto const d_self_equal = self_equal.equal_to( + nullate::DYNAMIC{haystack_has_nulls}, compare_nulls, nan_unequal_comparator{}); + auto const d_two_table_equal = two_table_equal.equal_to( + nullate::DYNAMIC{has_any_nulls}, compare_nulls, nan_unequal_comparator{}); + func(d_self_equal, d_two_table_equal, probing_scheme); } } } // namespace -/** - * @brief Check if rows in the given `needles` table exist in the `haystack` table. 
- * - * @param haystack The table containing the search space - * @param needles A table of rows whose existence to check in the search space - * @param compare_nulls Control whether nulls should be compared as equal or not - * @param compare_nans Control whether floating-point NaNs values should be compared as equal or not - * @param stream CUDA stream used for device memory operations and kernel launches - * @param mr Device memory resource used to allocate the returned vector - * @return A vector of bools indicating if each row in `needles` has matching rows in `haystack` - */ rmm::device_uvector contains(table_view const& haystack, table_view const& needles, null_equality compare_nulls, @@ -173,124 +187,97 @@ rmm::device_uvector contains(table_view const& haystack, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { - auto map = static_map(compute_hash_table_size(haystack.num_rows()), - cuco::empty_key{lhs_index_type{std::numeric_limits::max()}}, - cuco::empty_value{detail::JoinNoneValue}, - detail::hash_table_allocator_type{default_allocator{}, stream}, - stream.value()); + CUDF_EXPECTS(cudf::have_same_types(haystack, needles), "Column types mismatch"); auto const haystack_has_nulls = has_nested_nulls(haystack); auto const needles_has_nulls = has_nested_nulls(needles); auto const has_any_nulls = haystack_has_nulls || needles_has_nulls; + auto const preprocessed_needles = + cudf::experimental::row::equality::preprocessed_table::create(needles, stream); auto const preprocessed_haystack = cudf::experimental::row::equality::preprocessed_table::create(haystack, stream); - // Insert row indices of the haystack table as map keys. - { - auto const haystack_it = cudf::detail::make_counting_transform_iterator( - size_type{0}, - [] __device__(auto const idx) { return cuco::make_pair(lhs_index_type{idx}, 0); }); - - auto const hasher = cudf::experimental::row::hash::row_hasher(preprocessed_haystack); - auto const d_hasher = - strong_index_hasher_adapter{hasher.device_hasher(nullate::DYNAMIC{has_any_nulls})}; - - auto const comparator = - cudf::experimental::row::equality::self_comparator(preprocessed_haystack); - - // If the haystack table has nulls but they are compared unequal, don't insert them. - // Otherwise, it was known to cause performance issue: - // - https://github.com/rapidsai/cudf/pull/6943 - // - https://github.com/rapidsai/cudf/pull/8277 - if (haystack_has_nulls && compare_nulls == null_equality::UNEQUAL) { - auto const bitmask_buffer_and_ptr = build_row_bitmask(haystack, stream); - auto const row_bitmask_ptr = bitmask_buffer_and_ptr.second; - - auto const insert_map = [&](auto const value_comp) { - if (cudf::detail::has_nested_columns(haystack)) { - auto const d_eqcomp = strong_index_comparator_adapter{comparator.equal_to( - nullate::DYNAMIC{haystack_has_nulls}, compare_nulls, value_comp)}; - map.insert_if(haystack_it, - haystack_it + haystack.num_rows(), - thrust::counting_iterator(0), // stencil - row_is_valid{row_bitmask_ptr}, - d_hasher, - d_eqcomp, - stream.value()); - } else { - auto const d_eqcomp = strong_index_comparator_adapter{comparator.equal_to( - nullate::DYNAMIC{haystack_has_nulls}, compare_nulls, value_comp)}; - map.insert_if(haystack_it, - haystack_it + haystack.num_rows(), - thrust::counting_iterator(0), // stencil - row_is_valid{row_bitmask_ptr}, - d_hasher, - d_eqcomp, - stream.value()); - } - }; - - // Insert only rows that do not have any null at any level. 
- dispatch_nan_comparator(compare_nans, insert_map); - } else { // haystack_doesn't_have_nulls || compare_nulls == null_equality::EQUAL - auto const insert_map = [&](auto const value_comp) { - if (cudf::detail::has_nested_columns(haystack)) { - auto const d_eqcomp = strong_index_comparator_adapter{comparator.equal_to( - nullate::DYNAMIC{haystack_has_nulls}, compare_nulls, value_comp)}; - map.insert( - haystack_it, haystack_it + haystack.num_rows(), d_hasher, d_eqcomp, stream.value()); - } else { - auto const d_eqcomp = strong_index_comparator_adapter{comparator.equal_to( - nullate::DYNAMIC{haystack_has_nulls}, compare_nulls, value_comp)}; - map.insert( - haystack_it, haystack_it + haystack.num_rows(), d_hasher, d_eqcomp, stream.value()); - } - }; - - dispatch_nan_comparator(compare_nans, insert_map); - } - } + + auto const haystack_hasher = cudf::experimental::row::hash::row_hasher(preprocessed_haystack); + auto const d_haystack_hasher = haystack_hasher.device_hasher(nullate::DYNAMIC{has_any_nulls}); + auto const needle_hasher = cudf::experimental::row::hash::row_hasher(preprocessed_needles); + auto const d_needle_hasher = needle_hasher.device_hasher(nullate::DYNAMIC{has_any_nulls}); + auto const d_hasher = hasher_adapter{d_haystack_hasher, d_needle_hasher}; + + auto const self_equal = cudf::experimental::row::equality::self_comparator(preprocessed_haystack); + auto const two_table_equal = cudf::experimental::row::equality::two_table_comparator( + preprocessed_haystack, preprocessed_needles); // The output vector. auto contained = rmm::device_uvector(needles.num_rows(), stream, mr); - auto const preprocessed_needles = - cudf::experimental::row::equality::preprocessed_table::create(needles, stream); - // Check existence for each row of the needles table in the haystack table. 
- { - auto const needles_it = cudf::detail::make_counting_transform_iterator( - size_type{0}, [] __device__(auto const idx) { return rhs_index_type{idx}; }); - - auto const hasher = cudf::experimental::row::hash::row_hasher(preprocessed_needles); - auto const d_hasher = - strong_index_hasher_adapter{hasher.device_hasher(nullate::DYNAMIC{has_any_nulls})}; - - auto const comparator = cudf::experimental::row::equality::two_table_comparator( - preprocessed_haystack, preprocessed_needles); - - auto const check_contains = [&](auto const value_comp) { - if (cudf::detail::has_nested_columns(haystack) or cudf::detail::has_nested_columns(needles)) { - auto const d_eqcomp = - comparator.equal_to(nullate::DYNAMIC{has_any_nulls}, compare_nulls, value_comp); - map.contains(needles_it, - needles_it + needles.num_rows(), - contained.begin(), - d_hasher, - d_eqcomp, - stream.value()); + auto const haystack_iter = cudf::detail::make_counting_transform_iterator( + size_type{0}, [] __device__(auto idx) { return lhs_index_type{idx}; }); + auto const needles_iter = cudf::detail::make_counting_transform_iterator( + size_type{0}, [] __device__(auto idx) { return rhs_index_type{idx}; }); + + auto const helper_func = + [&](auto const& d_self_equal, auto const& d_two_table_equal, auto const& probing_scheme) { + auto const d_equal = comparator_adapter{d_self_equal, d_two_table_equal}; + + auto set = cuco::experimental::static_set{ + cuco::experimental::extent{compute_hash_table_size(haystack.num_rows())}, + cuco::empty_key{lhs_index_type{-1}}, + d_equal, + probing_scheme, + detail::hash_table_allocator_type{default_allocator{}, stream}, + stream.value()}; + + if (haystack_has_nulls && compare_nulls == null_equality::UNEQUAL) { + auto const bitmask_buffer_and_ptr = build_row_bitmask(haystack, stream); + auto const row_bitmask_ptr = bitmask_buffer_and_ptr.second; + + // If the haystack table has nulls but they are compared unequal, don't insert them. 
+ // Otherwise, it was known to cause performance issue: + // - https://github.com/rapidsai/cudf/pull/6943 + // - https://github.com/rapidsai/cudf/pull/8277 + set.insert_if_async(haystack_iter, + haystack_iter + haystack.num_rows(), + thrust::counting_iterator(0), // stencil + row_is_valid{row_bitmask_ptr}, + stream.value()); } else { - auto const d_eqcomp = - comparator.equal_to(nullate::DYNAMIC{has_any_nulls}, compare_nulls, value_comp); - map.contains(needles_it, - needles_it + needles.num_rows(), - contained.begin(), - d_hasher, - d_eqcomp, - stream.value()); + set.insert_async(haystack_iter, haystack_iter + haystack.num_rows(), stream.value()); + } + + if (needles_has_nulls && compare_nulls == null_equality::UNEQUAL) { + auto const bitmask_buffer_and_ptr = build_row_bitmask(needles, stream); + auto const row_bitmask_ptr = bitmask_buffer_and_ptr.second; + set.contains_if_async(needles_iter, + needles_iter + needles.num_rows(), + thrust::counting_iterator(0), // stencil + row_is_valid{row_bitmask_ptr}, + contained.begin(), + stream.value()); + } else { + set.contains_async( + needles_iter, needles_iter + needles.num_rows(), contained.begin(), stream.value()); } }; - dispatch_nan_comparator(compare_nans, check_contains); + if (cudf::detail::has_nested_columns(haystack)) { + dispatch_nan_comparator(compare_nulls, + compare_nans, + haystack_has_nulls, + has_any_nulls, + self_equal, + two_table_equal, + d_hasher, + helper_func); + } else { + dispatch_nan_comparator(compare_nulls, + compare_nans, + haystack_has_nulls, + has_any_nulls, + self_equal, + two_table_equal, + d_hasher, + helper_func); } return contained; From b25b292f7f97cbb681f0244e1a20b30a925145a1 Mon Sep 17 00:00:00 2001 From: David Wendt <45795991+davidwendt@users.noreply.github.com> Date: Tue, 26 Sep 2023 18:53:43 -0400 Subject: [PATCH 218/230] Add nvtext::tokenize_with_vocabulary API (#13930) Adds tokenize with vocabulary APIs to libcudf. ``` struct tokenize_vocabulary{ ... }; std::unique_ptr load_vocabulary( cudf::strings_column_view const& input, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr); std::unique_ptr tokenize_with_vocabulary( cudf::strings_column_view const& input, tokenize_vocabulary const& vocabulary, cudf::string_scalar const& delimiter, cudf::size_type default_id, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr); ``` Returns an integer lists column replacing individual tokens as resolved from the `input` using `delimiter` with id values which are the row indices of the input `vocabulary` column. If a token is not found in the `vocabulary` it is assigned `default_id`. The vocabulary can be loaded once using the `nvtext::load_vocabulary()` API and then used in repeated calls to `nvtext::tokenize_with_vocabulary()` with different input columns. 
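For illustration, a minimal C++ sketch of the intended call pattern (the column contents and includes here are hypothetical; the entry points are the `nvtext::load_vocabulary` and `nvtext::tokenize_with_vocabulary` APIs declared above, and the expected ids follow the vocabulary row indices):
```
#include <nvtext/tokenize.hpp>
#include <cudf/scalar/scalar.hpp>
#include <cudf_test/column_wrapper.hpp>

// Load the vocabulary once; token ids are the row indices of this column.
auto const vocab_col  = cudf::test::strings_column_wrapper({"hello", "there", "world"});
auto const vocabulary = nvtext::load_vocabulary(cudf::strings_column_view(vocab_col));

// The loaded vocabulary can then be reused across many input columns.
auto const input     = cudf::test::strings_column_wrapper({"hello world", "goodbye world"});
auto const delimiter = cudf::string_scalar(" ");
auto const token_ids = nvtext::tokenize_with_vocabulary(
  cudf::strings_column_view(input), *vocabulary, delimiter, /*default_id=*/-1);
// token_ids is a LIST<INT32> column: [[0, 2], [-1, 2]] ("goodbye" is not in the vocabulary)
```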
Python interface is new class `TokenizeVocabulary` which can be used like the following: ``` >>> import cudf >>> from cudf.core.tokenize_vocabulary import TokenizeVocabulary >>> words = cudf.Series( ['brown', 'the', 'dog', 'jumps'] ) >>> vocab = TokenizeVocabulary(words) >>> s = cudf.Series( ['the brown dog jumps over the brown cat'] ) >>> print(vocab(s)) 0 [1, 0, 2, 3, -1, 1, 0, -1] dtype: list ``` Authors: - David Wendt (https://github.com/davidwendt) Approvers: - Robert Maynard (https://github.com/robertmaynard) - https://github.com/nvdbaranec - Bradley Dice (https://github.com/bdice) URL: https://github.com/rapidsai/cudf/pull/13930 --- cpp/CMakeLists.txt | 1 + cpp/include/nvtext/tokenize.hpp | 78 ++++++ cpp/src/text/vocabulary_tokenize.cu | 257 ++++++++++++++++++ cpp/tests/text/tokenize_tests.cpp | 93 +++++-- python/cudf/cudf/_lib/cpp/nvtext/tokenize.pxd | 17 +- python/cudf/cudf/_lib/nvtext/tokenize.pyx | 40 ++- python/cudf/cudf/_lib/strings/__init__.py | 1 + python/cudf/cudf/core/tokenize_vocabulary.py | 48 ++++ .../cudf/cudf/tests/text/test_text_methods.py | 59 ++++ 9 files changed, 574 insertions(+), 20 deletions(-) create mode 100644 cpp/src/text/vocabulary_tokenize.cu create mode 100644 python/cudf/cudf/core/tokenize_vocabulary.py diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index a84f7bd5224..9656bc40fd7 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -618,6 +618,7 @@ add_library( src/text/subword/subword_tokenize.cu src/text/subword/wordpiece_tokenizer.cu src/text/tokenize.cu + src/text/vocabulary_tokenize.cu src/transform/bools_to_mask.cu src/transform/compute_column.cu src/transform/encode.cu diff --git a/cpp/include/nvtext/tokenize.hpp b/cpp/include/nvtext/tokenize.hpp index a72f7dcfa59..44f8f44557c 100644 --- a/cpp/include/nvtext/tokenize.hpp +++ b/cpp/include/nvtext/tokenize.hpp @@ -215,5 +215,83 @@ std::unique_ptr detokenize( cudf::string_scalar const& separator = cudf::string_scalar(" "), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +/** + * @brief Vocabulary object to be used with nvtext::tokenize_with_vocabulary + * + * Use nvtext::load_vocabulary to create this object. + */ +struct tokenize_vocabulary { + /** + * @brief Vocabulary object constructor + * + * Token ids are the row indices within the vocabulary column. + * Each vocabulary entry is expected to be unique otherwise the behavior is undefined. + * + * @throw cudf::logic_error if `vocabulary` contains nulls or is empty + * + * @param input Strings for the vocabulary + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate the returned column's device memory + */ + tokenize_vocabulary(cudf::strings_column_view const& input, + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + ~tokenize_vocabulary(); + + struct tokenize_vocabulary_impl; + tokenize_vocabulary_impl* _impl{}; +}; + +/** + * @brief Create a tokenize_vocabulary object from a strings column + * + * Token ids are the row indices within the vocabulary column. + * Each vocabulary entry is expected to be unique otherwise the behavior is undefined. 
+ * + * @throw cudf::logic_error if `vocabulary` contains nulls or is empty + * + * @param input Strings for the vocabulary + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate the returned column's device memory + * @return Object to be used with nvtext::tokenize_with_vocabulary + */ +std::unique_ptr load_vocabulary( + cudf::strings_column_view const& input, + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + +/** + * @brief Returns the token ids for the input string by looking up each delimited + * token in the given vocabulary + * + * @code{.pseudo} + * Example: + * s = ["hello world", "hello there", "there there world", "watch out world"] + * v = load_vocabulary(["hello", "there", "world"]) + * r = tokenize_with_vocabulary(s,v) + * r is now [[0,2], [0,1], [1,1,2], [-1,-1,2]] + * @endcode + * + * Any null row entry results in a corresponding null entry in the output + * + * @throw cudf::logic_error if `delimiter` is invalid + * + * @param input Strings column to tokenize + * @param vocabulary Used to lookup tokens within + * @param delimiter Used to identify tokens within `input` + * @param default_id The token id to be used for tokens not found in the `vocabulary`; + * Default is -1 + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate the returned column's device memory + * @return Lists column of token ids + */ +std::unique_ptr tokenize_with_vocabulary( + cudf::strings_column_view const& input, + tokenize_vocabulary const& vocabulary, + cudf::string_scalar const& delimiter, + cudf::size_type default_id = -1, + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + /** @} */ // end of tokenize group } // namespace nvtext diff --git a/cpp/src/text/vocabulary_tokenize.cu b/cpp/src/text/vocabulary_tokenize.cu new file mode 100644 index 00000000000..f998c9ec239 --- /dev/null +++ b/cpp/src/text/vocabulary_tokenize.cu @@ -0,0 +1,257 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include + +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include + +namespace nvtext { +namespace detail { +namespace { + +using string_hasher_type = cudf::hashing::detail::MurmurHash3_x86_32; +using hash_value_type = string_hasher_type::result_type; + +/** + * @brief Hasher function used for building and using the cuco static-map + * + * This takes advantage of heterogeneous lookup feature in cuco static-map which + * allows inserting with one type (index) and looking up with a different type (string). 
+ */ +struct vocab_hasher { + cudf::column_device_view const d_strings; + string_hasher_type hasher{}; + // used by insert + __device__ hash_value_type operator()(cudf::size_type index) const + { + return hasher(d_strings.element(index)); + } + // used by find + __device__ hash_value_type operator()(cudf::string_view const& s) const { return hasher(s); } +}; + +/** + * @brief Equal function used for building and using the cuco static-map + * + * This takes advantage of heterogeneous lookup feature in cuco static-map which + * allows inserting with one type (index) and looking up with a different type (string). + */ +struct vocab_equal { + cudf::column_device_view const d_strings; + // used by insert + __device__ bool operator()(cudf::size_type lhs, cudf::size_type rhs) const noexcept + { + return lhs == rhs; // all rows are expected to be unique + } + // used by find + __device__ bool operator()(cudf::size_type lhs, cudf::string_view const& rhs) const noexcept + { + return d_strings.element(lhs) == rhs; + } +}; + +using hash_table_allocator_type = rmm::mr::stream_allocator_adaptor>; +using probe_scheme = cuco::experimental::linear_probing<1, vocab_hasher>; +using vocabulary_map_type = cuco::experimental::static_map, + cuda::thread_scope_device, + vocab_equal, + probe_scheme, + hash_table_allocator_type>; +} // namespace +} // namespace detail + +// since column_device_view::create returns is a little more than +// std::unique_ptr this helper simplifies the return type in a maintainable way +using col_device_view = std::invoke_result_t; + +struct tokenize_vocabulary::tokenize_vocabulary_impl { + std::unique_ptr const vocabulary; + col_device_view const d_vocabulary; + std::unique_ptr vocabulary_map; + + auto get_map_ref() const { return vocabulary_map->ref(cuco::experimental::op::find); } + + tokenize_vocabulary_impl(std::unique_ptr&& vocab, + col_device_view&& d_vocab, + std::unique_ptr&& map) + : vocabulary(std::move(vocab)), d_vocabulary(std::move(d_vocab)), vocabulary_map(std::move(map)) + { + } +}; + +struct key_pair { + __device__ auto operator()(cudf::size_type idx) const noexcept + { + return cuco::make_pair(idx, idx); + } +}; + +tokenize_vocabulary::tokenize_vocabulary(cudf::strings_column_view const& input, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + CUDF_EXPECTS(not input.is_empty(), "vocabulary must not be empty"); + CUDF_EXPECTS(not input.has_nulls(), "vocabulary must not have nulls"); + + // need to hold a copy of the input + auto vocabulary = std::make_unique(input.parent(), stream, mr); + auto d_vocabulary = cudf::column_device_view::create(vocabulary->view(), stream); + + auto vocab_map = std::make_unique( + static_cast(vocabulary->size() * 2), + cuco::empty_key{-1}, + cuco::empty_value{-1}, + detail::vocab_equal{*d_vocabulary}, + detail::probe_scheme{detail::vocab_hasher{*d_vocabulary}}, + detail::hash_table_allocator_type{default_allocator{}, stream}, + stream.value()); + + // the row index is the token id (value for each key in the map) + auto iter = cudf::detail::make_counting_transform_iterator(0, key_pair{}); + vocab_map->insert_async(iter, iter + vocabulary->size(), stream.value()); + + _impl = new tokenize_vocabulary_impl( + std::move(vocabulary), std::move(d_vocabulary), std::move(vocab_map)); +} +tokenize_vocabulary::~tokenize_vocabulary() { delete _impl; } + +std::unique_ptr load_vocabulary(cudf::strings_column_view const& input, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + CUDF_FUNC_RANGE(); + return 
std::make_unique(input, stream, mr); +} + +namespace detail { +namespace { + +/** + * @brief Tokenizes each string and uses the map to assign token id values + * + * @tparam MapRefType Type of the static_map reference for calling find() + */ +template +struct vocabulary_tokenizer_fn { + cudf::column_device_view const d_strings; + cudf::string_view const d_delimiter; + MapRefType d_map; + cudf::size_type const default_id; + cudf::size_type const* d_offsets; + cudf::size_type* d_results; + + __device__ void operator()(cudf::size_type idx) const + { + if (d_strings.is_null(idx)) { return; } + + auto const d_str = d_strings.element(idx); + characters_tokenizer tokenizer(d_str, d_delimiter); + auto d_tokens = d_results + d_offsets[idx]; + + cudf::size_type token_idx = 0; + while (tokenizer.next_token()) { + auto const pos = tokenizer.token_byte_positions(); + auto const token = cudf::string_view{d_str.data() + pos.first, (pos.second - pos.first)}; + // lookup token in map + auto const itr = d_map.find(token); + auto const id = (itr != d_map.end()) ? itr->second : default_id; + // set value into the output + d_tokens[token_idx++] = id; + } + } +}; + +} // namespace + +std::unique_ptr tokenize_with_vocabulary(cudf::strings_column_view const& input, + tokenize_vocabulary const& vocabulary, + cudf::string_scalar const& delimiter, + cudf::size_type default_id, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + CUDF_EXPECTS(delimiter.is_valid(stream), "Parameter delimiter must be valid"); + + auto const output_type = cudf::data_type{cudf::type_to_id()}; + if (input.is_empty()) { return cudf::make_empty_column(output_type); } + + // count the tokens per string and build the offsets from the counts + auto const d_strings = cudf::column_device_view::create(input.parent(), stream); + auto const d_delimiter = delimiter.value(stream); + auto const sizes_itr = + cudf::detail::make_counting_transform_iterator(0, strings_tokenizer{*d_strings, d_delimiter}); + auto [token_offsets, total_count] = + cudf::detail::make_offsets_child_column(sizes_itr, sizes_itr + input.size(), stream, mr); + + // build the output column to hold all the token ids + auto tokens = + cudf::make_numeric_column(output_type, total_count, cudf::mask_state::UNALLOCATED, stream, mr); + auto map_ref = vocabulary._impl->get_map_ref(); + auto d_offsets = token_offsets->view().data(); + auto d_tokens = tokens->mutable_view().data(); + vocabulary_tokenizer_fn tokenizer{ + *d_strings, d_delimiter, map_ref, default_id, d_offsets, d_tokens}; + thrust::for_each_n(rmm::exec_policy(stream), + thrust::make_counting_iterator(0), + input.size(), + tokenizer); + + return cudf::make_lists_column(input.size(), + std::move(token_offsets), + std::move(tokens), + input.null_count(), + cudf::detail::copy_bitmask(input.parent(), stream, mr), + stream, + mr); +} + +} // namespace detail + +std::unique_ptr tokenize_with_vocabulary(cudf::strings_column_view const& input, + tokenize_vocabulary const& vocabulary, + cudf::string_scalar const& delimiter, + cudf::size_type default_id, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + CUDF_FUNC_RANGE(); + return detail::tokenize_with_vocabulary(input, vocabulary, delimiter, default_id, stream, mr); +} + +} // namespace nvtext diff --git a/cpp/tests/text/tokenize_tests.cpp b/cpp/tests/text/tokenize_tests.cpp index 14fc4f8c6db..d78f2dfbdf3 100644 --- a/cpp/tests/text/tokenize_tests.cpp +++ b/cpp/tests/text/tokenize_tests.cpp @@ -14,14 +14,16 @@ * limitations under the 
License. */ -#include -#include -#include -#include - #include #include #include +#include + +#include + +#include +#include +#include #include @@ -125,29 +127,37 @@ TEST_F(TextTokenizeTest, CharacterTokenize) TEST_F(TextTokenizeTest, TokenizeEmptyTest) { - auto strings = cudf::make_empty_column(cudf::data_type{cudf::type_id::STRING}); - cudf::test::strings_column_wrapper all_empty({"", "", ""}); - cudf::test::strings_column_wrapper all_null({"", "", ""}, {0, 0, 0}); - cudf::test::fixed_width_column_wrapper expected({0, 0, 0}); - - auto results = nvtext::tokenize(cudf::strings_column_view(strings->view())); + auto input = cudf::make_empty_column(cudf::data_type{cudf::type_id::STRING}); + auto view = cudf::strings_column_view(input->view()); + cudf::test::strings_column_wrapper all_empty_wrapper({"", "", ""}); + auto all_empty = cudf::strings_column_view(all_empty_wrapper); + cudf::test::strings_column_wrapper all_null_wrapper({"", "", ""}, {0, 0, 0}); + auto all_null = cudf::strings_column_view(all_null_wrapper); + cudf::test::fixed_width_column_wrapper expected({0, 0, 0}); + + auto results = nvtext::tokenize(view); EXPECT_EQ(results->size(), 0); - results = nvtext::tokenize(cudf::strings_column_view(all_empty)); + results = nvtext::tokenize(all_empty); EXPECT_EQ(results->size(), 0); - results = nvtext::tokenize(cudf::strings_column_view(all_null)); + results = nvtext::tokenize(all_null); EXPECT_EQ(results->size(), 0); - results = nvtext::count_tokens(cudf::strings_column_view(strings->view())); + results = nvtext::count_tokens(view); EXPECT_EQ(results->size(), 0); - results = nvtext::count_tokens(cudf::strings_column_view(all_empty)); + results = nvtext::count_tokens(all_empty); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); results = nvtext::count_tokens(cudf::strings_column_view(all_null)); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); - results = nvtext::character_tokenize(cudf::strings_column_view(strings->view())); + results = nvtext::character_tokenize(view); EXPECT_EQ(results->size(), 0); - results = nvtext::character_tokenize(cudf::strings_column_view(all_empty)); + results = nvtext::character_tokenize(all_empty); EXPECT_EQ(results->size(), 0); - results = nvtext::character_tokenize(cudf::strings_column_view(all_null)); + results = nvtext::character_tokenize(all_null); EXPECT_EQ(results->size(), 0); + auto const delimiter = cudf::string_scalar{""}; + results = nvtext::tokenize_with_vocabulary(view, all_empty, delimiter); + EXPECT_EQ(results->size(), 0); + results = nvtext::tokenize_with_vocabulary(all_null, all_empty, delimiter); + EXPECT_EQ(results->size(), results->null_count()); } TEST_F(TextTokenizeTest, Detokenize) @@ -191,3 +201,50 @@ TEST_F(TextTokenizeTest, DetokenizeErrors) EXPECT_THROW(nvtext::detokenize(strings_view, one, cudf::string_scalar("", false)), cudf::logic_error); } + +TEST_F(TextTokenizeTest, Vocabulary) +{ + cudf::test::strings_column_wrapper vocabulary( // leaving out 'cat' on purpose + {"ate", "chased", "cheese", "dog", "fox", "jumped", "mouse", "mousé", "over", "the"}); + auto vocab = nvtext::load_vocabulary(cudf::strings_column_view(vocabulary)); + + auto validity = cudf::test::iterators::null_at(1); + cudf::test::strings_column_wrapper input({"the fox jumped over the dog", + "the dog chased the cat", + "the cat chased the mouse", + "the mousé ate cheese", + "", + ""}, + validity); + auto input_view = cudf::strings_column_view(input); + auto delimiter = cudf::string_scalar(" "); + auto default_id = -7; // should be the token for the missing 
'cat' + auto results = nvtext::tokenize_with_vocabulary(input_view, *vocab, delimiter, default_id); + + using LCW = cudf::test::lists_column_wrapper; + // clang-format off + LCW expected({LCW{ 9, 4, 5, 8, 9, 3}, + LCW{ 9, 3, 1, 9,-7}, + LCW{ 9,-7, 1, 9, 6}, + LCW{ 9, 7, 0, 2}, + LCW{}, LCW{}}, + validity); + // clang-format on + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); +} + +TEST_F(TextTokenizeTest, TokenizeErrors) +{ + cudf::test::strings_column_wrapper empty{}; + cudf::strings_column_view view(empty); + EXPECT_THROW(nvtext::load_vocabulary(view), cudf::logic_error); + + cudf::test::strings_column_wrapper vocab_nulls({""}, {0}); + cudf::strings_column_view nulls(vocab_nulls); + EXPECT_THROW(nvtext::load_vocabulary(nulls), cudf::logic_error); + + cudf::test::strings_column_wrapper some{"hello"}; + auto vocab = nvtext::load_vocabulary(cudf::strings_column_view(some)); + EXPECT_THROW(nvtext::tokenize_with_vocabulary(view, *vocab, cudf::string_scalar("", false)), + cudf::logic_error); +} diff --git a/python/cudf/cudf/_lib/cpp/nvtext/tokenize.pxd b/python/cudf/cudf/_lib/cpp/nvtext/tokenize.pxd index 8b80f50e381..3cc3fd6251a 100644 --- a/python/cudf/cudf/_lib/cpp/nvtext/tokenize.pxd +++ b/python/cudf/cudf/_lib/cpp/nvtext/tokenize.pxd @@ -1,10 +1,11 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. +# Copyright (c) 2020-2023, NVIDIA CORPORATION. from libcpp.memory cimport unique_ptr from cudf._lib.cpp.column.column cimport column from cudf._lib.cpp.column.column_view cimport column_view from cudf._lib.cpp.scalar.scalar cimport string_scalar +from cudf._lib.cpp.types cimport size_type cdef extern from "nvtext/tokenize.hpp" namespace "nvtext" nogil: @@ -38,3 +39,17 @@ cdef extern from "nvtext/tokenize.hpp" namespace "nvtext" nogil: const column_view & row_indices, const string_scalar & separator ) except + + + cdef struct tokenize_vocabulary "nvtext::tokenize_vocabulary": + pass + + cdef unique_ptr[tokenize_vocabulary] load_vocabulary( + const column_view & strings + ) except + + + cdef unique_ptr[column] tokenize_with_vocabulary( + const column_view & strings, + const tokenize_vocabulary & vocabulary, + const string_scalar & delimiter, + size_type default_id + ) except + diff --git a/python/cudf/cudf/_lib/nvtext/tokenize.pyx b/python/cudf/cudf/_lib/nvtext/tokenize.pyx index 2bb4fa8e108..bee9d6f6c4d 100644 --- a/python/cudf/cudf/_lib/nvtext/tokenize.pyx +++ b/python/cudf/cudf/_lib/nvtext/tokenize.pyx @@ -1,4 +1,4 @@ -# Copyright (c) 2018-2022, NVIDIA CORPORATION. +# Copyright (c) 2018-2023, NVIDIA CORPORATION. 
from cudf.core.buffer import acquire_spill_lock @@ -12,9 +12,13 @@ from cudf._lib.cpp.nvtext.tokenize cimport ( character_tokenize as cpp_character_tokenize, count_tokens as cpp_count_tokens, detokenize as cpp_detokenize, + load_vocabulary as cpp_load_vocabulary, tokenize as cpp_tokenize, + tokenize_vocabulary as cpp_tokenize_vocabulary, + tokenize_with_vocabulary as cpp_tokenize_with_vocabulary, ) from cudf._lib.cpp.scalar.scalar cimport string_scalar +from cudf._lib.cpp.types cimport size_type from cudf._lib.scalar cimport DeviceScalar @@ -122,3 +126,37 @@ def detokenize(Column strings, Column indices, object py_separator): ) return Column.from_unique_ptr(move(c_result)) + + +cdef class TokenizeVocabulary: + cdef unique_ptr[cpp_tokenize_vocabulary] c_obj + + def __cinit__(self, Column vocab): + cdef column_view c_vocab = vocab.view() + with nogil: + self.c_obj = move(cpp_load_vocabulary(c_vocab)) + + +@acquire_spill_lock() +def tokenize_with_vocabulary(Column strings, + TokenizeVocabulary vocabulary, + object py_delimiter, + size_type default_id): + + cdef DeviceScalar delimiter = py_delimiter.device_value + cdef column_view c_strings = strings.view() + cdef const string_scalar* c_delimiter = delimiter\ + .get_raw_ptr() + cdef unique_ptr[column] c_result + + with nogil: + c_result = move( + cpp_tokenize_with_vocabulary( + c_strings, + vocabulary.c_obj.get()[0], + c_delimiter[0], + default_id + ) + ) + + return Column.from_unique_ptr(move(c_result)) diff --git a/python/cudf/cudf/_lib/strings/__init__.py b/python/cudf/cudf/_lib/strings/__init__.py index 16875e4397e..47a194c4fda 100644 --- a/python/cudf/cudf/_lib/strings/__init__.py +++ b/python/cudf/cudf/_lib/strings/__init__.py @@ -23,6 +23,7 @@ _tokenize_scalar, character_tokenize, detokenize, + tokenize_with_vocabulary, ) from cudf._lib.strings.attributes import ( code_points, diff --git a/python/cudf/cudf/core/tokenize_vocabulary.py b/python/cudf/cudf/core/tokenize_vocabulary.py new file mode 100644 index 00000000000..afb3496311b --- /dev/null +++ b/python/cudf/cudf/core/tokenize_vocabulary.py @@ -0,0 +1,48 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. + +from __future__ import annotations + +import cudf +from cudf._lib.nvtext.tokenize import ( + TokenizeVocabulary as cpp_tokenize_vocabulary, + tokenize_with_vocabulary as cpp_tokenize_with_vocabulary, +) + + +class TokenizeVocabulary: + """ + A vocabulary object used to tokenize input text. + + Parameters + ---------- + vocabulary : str + Strings column of vocabulary terms + """ + + def __init__(self, vocabulary: "cudf.Series"): + self.vocabulary = cpp_tokenize_vocabulary(vocabulary._column) + + def tokenize(self, text, delimiter: str = "", default_id: int = -1): + """ + Parameters + ---------- + text : cudf string series + The strings to be tokenized. + delimiter : str + Delimiter to identify tokens. Default is whitespace. + default_id : int + Value to use for tokens not found in the vocabulary. + Default is -1. 
+ + Returns + ------- + Tokenized strings + """ + if delimiter is None: + delimiter = "" + delim = cudf.Scalar(delimiter, dtype="str") + result = cpp_tokenize_with_vocabulary( + text._column, self.vocabulary, delim, default_id + ) + + return cudf.Series(result) diff --git a/python/cudf/cudf/tests/text/test_text_methods.py b/python/cudf/cudf/tests/text/test_text_methods.py index 8cda15e4acc..2241390a531 100644 --- a/python/cudf/cudf/tests/text/test_text_methods.py +++ b/python/cudf/cudf/tests/text/test_text_methods.py @@ -7,6 +7,7 @@ import pytest import cudf +from cudf.core.tokenize_vocabulary import TokenizeVocabulary from cudf.testing._utils import assert_eq @@ -156,6 +157,64 @@ def test_token_count(delimiter, expected_token_counts): assert_eq(expected, actual, check_dtype=False) +@pytest.mark.parametrize( + "delimiter, input, default_id, results", + [ + ( + "", + "the quick brown fox jumps over the lazy brown dog", + 99, + [0, 1, 2, 3, 4, 5, 0, 99, 2, 6], + ), + ( + " ", + " the sable siamésé cat jumps under the brown sofa ", + -1, + [0, 7, 8, 9, 4, 10, 0, 2, 11], + ), + ( + "_", + "the_quick_brown_fox_jumped__over_the_lazy_brown_dog", + -99, + [0, 1, 2, 3, -99, 5, 0, -99, 2, 6], + ), + ], +) +def test_tokenize_with_vocabulary(delimiter, input, default_id, results): + vocabulary = cudf.Series( + [ + "the", + "quick", + "brown", + "fox", + "jumps", + "over", + "dog", + "sable", + "siamésé", + "cat", + "under", + "sofa", + ] + ) + tokenizer = TokenizeVocabulary(vocabulary) + + strings = cudf.Series([input, None, "", input]) + + expected = cudf.Series( + [ + cudf.Series(results, dtype=np.int32), + None, + cudf.Series([], dtype=np.int32), + cudf.Series(results, dtype=np.int32), + ] + ) + + actual = tokenizer.tokenize(strings, delimiter, default_id) + assert type(expected) == type(actual) + assert_eq(expected, actual) + + def test_normalize_spaces(): strings = cudf.Series( [ From 31e56702fe15f44b3e849207d31d0bb79c307367 Mon Sep 17 00:00:00 2001 From: Karthikeyan <6488848+karthikeyann@users.noreply.github.com> Date: Wed, 27 Sep 2023 09:29:35 +0530 Subject: [PATCH 219/230] Workaround for illegal instruction error in sm90 for warp instrinsics with mask (#14201) Workaround for illegal instruction error in sm90 for warp instrinsics with non `0xffffffff` mask Removed the mask, and used ~0u (`0xffffffff`) as MASK because - all threads in warp has correct data on error since is_within_bounds==true thread update error. - init_state is not required at last iteration only where MASK is not ~0u. Fixes #14183 Authors: - Karthikeyan (https://github.com/karthikeyann) Approvers: - Divye Gala (https://github.com/divyegala) - Elias Stehle (https://github.com/elstehle) - Mark Harris (https://github.com/harrism) URL: https://github.com/rapidsai/cudf/pull/14201 --- cpp/src/io/utilities/data_casting.cu | 16 +++++----------- 1 file changed, 5 insertions(+), 11 deletions(-) diff --git a/cpp/src/io/utilities/data_casting.cu b/cpp/src/io/utilities/data_casting.cu index d16237d7afe..9e5c5c76392 100644 --- a/cpp/src/io/utilities/data_casting.cu +++ b/cpp/src/io/utilities/data_casting.cu @@ -534,8 +534,7 @@ __global__ void parse_fn_string_parallel(str_tuple_it str_tuples, char_index < cudf::util::round_up_safe(in_end - in_begin, static_cast(BLOCK_SIZE)); char_index += BLOCK_SIZE) { bool const is_within_bounds = char_index < (in_end - in_begin); - auto const MASK = is_warp ? __ballot_sync(0xffffffff, is_within_bounds) : 0xffffffff; - auto const c = is_within_bounds ? 
in_begin[char_index] : '\0'; + auto const c = is_within_bounds ? in_begin[char_index] : '\0'; auto const prev_c = (char_index > 0 and is_within_bounds) ? in_begin[char_index - 1] : '\0'; auto const escaped_char = get_escape_char(c); @@ -571,7 +570,7 @@ __global__ void parse_fn_string_parallel(str_tuple_it str_tuples, __shared__ typename SlashScan::TempStorage temp_slash[num_warps]; SlashScan(temp_slash[warp_id]).InclusiveScan(curr, scanned, composite_op); is_escaping_backslash = scanned.get(init_state); - init_state = __shfl_sync(MASK, is_escaping_backslash, BLOCK_SIZE - 1); + init_state = __shfl_sync(~0u, is_escaping_backslash, BLOCK_SIZE - 1); __syncwarp(); is_slash.shift(warp_id); is_slash.set_bits(warp_id, is_escaping_backslash); @@ -604,7 +603,7 @@ __global__ void parse_fn_string_parallel(str_tuple_it str_tuples, } // Make sure all threads have no errors before continuing if constexpr (is_warp) { - error = __any_sync(MASK, error); + error = __any_sync(~0u, error); } else { using ErrorReduce = cub::BlockReduce; __shared__ typename ErrorReduce::TempStorage temp_storage_error; @@ -932,13 +931,8 @@ std::unique_ptr parse_data( auto str_tuples = thrust::make_transform_iterator(offset_length_begin, to_string_view_pair{data}); if (col_type == cudf::data_type{cudf::type_id::STRING}) { - return parse_string(str_tuples, - col_size, - std::forward(null_mask), - d_null_count, - options, - stream, - mr); + return parse_string( + str_tuples, col_size, std::move(null_mask), d_null_count, options, stream, mr); } auto out_col = From ce247961216dd70f389763dc086f137c11ad7346 Mon Sep 17 00:00:00 2001 From: Nghia Truong <7416935+ttnghia@users.noreply.github.com> Date: Wed, 27 Sep 2023 10:10:31 -0700 Subject: [PATCH 220/230] Implement `HISTOGRAM` and `MERGE_HISTOGRAM` aggregations (#14045) This adds two more aggregations for groupby and reduction: * `HISTOGRAM`: Count the number of occurrences (aka frequency) for each element, and * `MERGE_HISTOGRAM`: Merge different outputs generated by `HISTOGRAM` aggregations This is the prerequisite for implementing the exact distributed percentile aggregation (https://github.com/rapidsai/cudf/issues/13885). However, these two new aggregations may be useful in other use-cases that need to do frequency counting. Closes https://github.com/rapidsai/cudf/issues/13885. Merging checklist: * [X] Working prototypes. * [X] Cleanup and docs. * [X] Unit test. * [ ] Test with spark-rapids integration tests. 
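For illustration, a minimal sketch of how the new `HISTOGRAM` aggregation could be requested through the groupby API (the keys and values below are hypothetical; see the unit tests added in this PR for the authoritative usage):
```
#include <cudf/aggregation.hpp>
#include <cudf/groupby.hpp>
#include <cudf_test/column_wrapper.hpp>

// Hypothetical input: two key groups with repeated values.
auto const keys = cudf::test::fixed_width_column_wrapper<int32_t>{1, 1, 2, 2, 2};
auto const vals = cudf::test::fixed_width_column_wrapper<int32_t>{7, 7, 8, 9, 8};

cudf::groupby::groupby gb{cudf::table_view{{keys}}};
std::vector<cudf::groupby::aggregation_request> requests(1);
requests[0].values = vals;
requests[0].aggregations.push_back(
  cudf::make_histogram_aggregation<cudf::groupby_aggregation>());

auto const [group_keys, results] = gb.aggregate(requests);
// Each group yields a LIST<STRUCT<value, count>> histogram, e.g.
//   key 1 -> [{7, 2}], key 2 -> [{8, 2}, {9, 1}]
// (element order within each histogram is unspecified).

// Partial histograms produced on different workers can later be combined by
// running cudf::make_merge_histogram_aggregation<cudf::groupby_aggregation>()
// over the concatenated partial results.
```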
Authors: - Nghia Truong (https://github.com/ttnghia) Approvers: - Robert Maynard (https://github.com/robertmaynard) - Yunsong Wang (https://github.com/PointKernel) - Vukasin Milovanovic (https://github.com/vuule) URL: https://github.com/rapidsai/cudf/pull/14045 --- cpp/CMakeLists.txt | 2 + cpp/include/cudf/aggregation.hpp | 22 +- .../cudf/detail/aggregation/aggregation.hpp | 60 +++ .../cudf/detail/hash_reduce_by_row.cuh | 4 + .../cudf/reduction/detail/histogram.hpp | 57 +++ .../reduction/detail/reduction_functions.hpp | 27 ++ cpp/src/aggregation/aggregation.cpp | 42 ++ cpp/src/groupby/groupby.cu | 10 + cpp/src/groupby/sort/aggregate.cpp | 30 ++ cpp/src/groupby/sort/group_histogram.cu | 152 +++++++ cpp/src/groupby/sort/group_reductions.hpp | 57 ++- cpp/src/reductions/histogram.cu | 273 ++++++++++++ cpp/src/reductions/reductions.cpp | 12 + cpp/tests/CMakeLists.txt | 1 + cpp/tests/groupby/histogram_tests.cpp | 396 ++++++++++++++++++ cpp/tests/reductions/reduction_tests.cpp | 207 +++++++++ 16 files changed, 1349 insertions(+), 3 deletions(-) create mode 100644 cpp/include/cudf/reduction/detail/histogram.hpp create mode 100644 cpp/src/groupby/sort/group_histogram.cu create mode 100644 cpp/src/reductions/histogram.cu create mode 100644 cpp/tests/groupby/histogram_tests.cpp diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 9656bc40fd7..ec58c391001 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -323,6 +323,7 @@ add_library( src/groupby/sort/group_collect.cu src/groupby/sort/group_correlation.cu src/groupby/sort/group_count.cu + src/groupby/sort/group_histogram.cu src/groupby/sort/group_m2.cu src/groupby/sort/group_max.cu src/groupby/sort/group_min.cu @@ -471,6 +472,7 @@ add_library( src/reductions/all.cu src/reductions/any.cu src/reductions/collect_ops.cu + src/reductions/histogram.cu src/reductions/max.cu src/reductions/mean.cu src/reductions/min.cu diff --git a/cpp/include/cudf/aggregation.hpp b/cpp/include/cudf/aggregation.hpp index d319041f8b1..d458c831f19 100644 --- a/cpp/include/cudf/aggregation.hpp +++ b/cpp/include/cudf/aggregation.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2022, NVIDIA CORPORATION. + * Copyright (c) 2019-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -116,7 +116,9 @@ class aggregation { COVARIANCE, ///< covariance between two sets of elements CORRELATION, ///< correlation between two sets of elements TDIGEST, ///< create a tdigest from a set of input values - MERGE_TDIGEST ///< create a tdigest by merging multiple tdigests together + MERGE_TDIGEST, ///< create a tdigest by merging multiple tdigests together + HISTOGRAM, ///< compute frequency of each element + MERGE_HISTOGRAM ///< merge partial values of HISTOGRAM aggregation, }; aggregation() = delete; @@ -288,6 +290,11 @@ std::unique_ptr make_any_aggregation(); template std::unique_ptr make_all_aggregation(); +/// Factory to create a HISTOGRAM aggregation +/// @return A HISTOGRAM aggregation object +template +std::unique_ptr make_histogram_aggregation(); + /// Factory to create a SUM_OF_SQUARES aggregation /// @return A SUM_OF_SQUARES aggregation object template @@ -610,6 +617,17 @@ std::unique_ptr make_merge_sets_aggregation( template std::unique_ptr make_merge_m2_aggregation(); +/** + * @brief Factory to create a MERGE_HISTOGRAM aggregation + * + * Merges the results of `HISTOGRAM` aggregations on independent sets into a new `HISTOGRAM` value + * equivalent to if a single `HISTOGRAM` aggregation was done across all of the sets at once. + * + * @return A MERGE_HISTOGRAM aggregation object + */ +template +std::unique_ptr make_merge_histogram_aggregation(); + /** * @brief Factory to create a COVARIANCE aggregation * diff --git a/cpp/include/cudf/detail/aggregation/aggregation.hpp b/cpp/include/cudf/detail/aggregation/aggregation.hpp index 4d3984cab93..784f05a964e 100644 --- a/cpp/include/cudf/detail/aggregation/aggregation.hpp +++ b/cpp/include/cudf/detail/aggregation/aggregation.hpp @@ -45,6 +45,8 @@ class simple_aggregations_collector { // Declares the interface for the simple class max_aggregation const& agg); virtual std::vector> visit(data_type col_type, class count_aggregation const& agg); + virtual std::vector> visit(data_type col_type, + class histogram_aggregation const& agg); virtual std::vector> visit(data_type col_type, class any_aggregation const& agg); virtual std::vector> visit(data_type col_type, @@ -89,6 +91,8 @@ class simple_aggregations_collector { // Declares the interface for the simple class merge_sets_aggregation const& agg); virtual std::vector> visit(data_type col_type, class merge_m2_aggregation const& agg); + virtual std::vector> visit( + data_type col_type, class merge_histogram_aggregation const& agg); virtual std::vector> visit(data_type col_type, class covariance_aggregation const& agg); virtual std::vector> visit(data_type col_type, @@ -108,6 +112,7 @@ class aggregation_finalizer { // Declares the interface for the finalizer virtual void visit(class min_aggregation const& agg); virtual void visit(class max_aggregation const& agg); virtual void visit(class count_aggregation const& agg); + virtual void visit(class histogram_aggregation const& agg); virtual void visit(class any_aggregation const& agg); virtual void visit(class all_aggregation const& agg); virtual void visit(class sum_of_squares_aggregation const& agg); @@ -130,6 +135,7 @@ class aggregation_finalizer { // Declares the interface for the finalizer virtual void visit(class merge_lists_aggregation const& agg); virtual void visit(class merge_sets_aggregation const& agg); virtual void visit(class merge_m2_aggregation const& agg); + virtual void visit(class merge_histogram_aggregation const& agg); virtual void visit(class covariance_aggregation const& agg); virtual void visit(class 
correlation_aggregation const& agg); virtual void visit(class tdigest_aggregation const& agg); @@ -251,6 +257,25 @@ class count_aggregation final : public rolling_aggregation, void finalize(aggregation_finalizer& finalizer) const override { finalizer.visit(*this); } }; +/** + * @brief Derived class for specifying a histogram aggregation + */ +class histogram_aggregation final : public groupby_aggregation, public reduce_aggregation { + public: + histogram_aggregation() : aggregation(HISTOGRAM) {} + + [[nodiscard]] std::unique_ptr clone() const override + { + return std::make_unique(*this); + } + std::vector> get_simple_aggregations( + data_type col_type, simple_aggregations_collector& collector) const override + { + return collector.visit(col_type, *this); + } + void finalize(aggregation_finalizer& finalizer) const override { finalizer.visit(*this); } +}; + /** * @brief Derived class for specifying an any aggregation */ @@ -972,6 +997,25 @@ class merge_m2_aggregation final : public groupby_aggregation { void finalize(aggregation_finalizer& finalizer) const override { finalizer.visit(*this); } }; +/** + * @brief Derived aggregation class for specifying MERGE_HISTOGRAM aggregation + */ +class merge_histogram_aggregation final : public groupby_aggregation, public reduce_aggregation { + public: + explicit merge_histogram_aggregation() : aggregation{MERGE_HISTOGRAM} {} + + [[nodiscard]] std::unique_ptr clone() const override + { + return std::make_unique(*this); + } + std::vector> get_simple_aggregations( + data_type col_type, simple_aggregations_collector& collector) const override + { + return collector.visit(col_type, *this); + } + void finalize(aggregation_finalizer& finalizer) const override { finalizer.visit(*this); } +}; + /** * @brief Derived aggregation class for specifying COVARIANCE aggregation */ @@ -1148,6 +1192,12 @@ struct target_type_impl { using type = size_type; }; +// Use list for HISTOGRAM +template +struct target_type_impl { + using type = list_view; +}; + // Computing ANY of any type, use bool accumulator template struct target_type_impl { @@ -1326,6 +1376,12 @@ struct target_type_impl { using type = struct_view; }; +// Use list for MERGE_HISTOGRAM +template +struct target_type_impl { + using type = list_view; +}; + // Always use double for COVARIANCE template struct target_type_impl { @@ -1417,6 +1473,8 @@ CUDF_HOST_DEVICE inline decltype(auto) aggregation_dispatcher(aggregation::Kind return f.template operator()(std::forward(args)...); case aggregation::COUNT_ALL: return f.template operator()(std::forward(args)...); + case aggregation::HISTOGRAM: + return f.template operator()(std::forward(args)...); case aggregation::ANY: return f.template operator()(std::forward(args)...); case aggregation::ALL: @@ -1460,6 +1518,8 @@ CUDF_HOST_DEVICE inline decltype(auto) aggregation_dispatcher(aggregation::Kind return f.template operator()(std::forward(args)...); case aggregation::MERGE_M2: return f.template operator()(std::forward(args)...); + case aggregation::MERGE_HISTOGRAM: + return f.template operator()(std::forward(args)...); case aggregation::COVARIANCE: return f.template operator()(std::forward(args)...); case aggregation::CORRELATION: diff --git a/cpp/include/cudf/detail/hash_reduce_by_row.cuh b/cpp/include/cudf/detail/hash_reduce_by_row.cuh index 2d2b43f1d4a..f63d1922950 100644 --- a/cpp/include/cudf/detail/hash_reduce_by_row.cuh +++ b/cpp/include/cudf/detail/hash_reduce_by_row.cuh @@ -14,12 +14,15 @@ * limitations under the License. 
*/ +#include +#include #include #include #include #include #include +#include #include #include @@ -29,6 +32,7 @@ namespace cudf::detail { +using hash_table_allocator_type = rmm::mr::stream_allocator_adaptor>; using hash_map_type = cuco::static_map; diff --git a/cpp/include/cudf/reduction/detail/histogram.hpp b/cpp/include/cudf/reduction/detail/histogram.hpp new file mode 100644 index 00000000000..97c711fda4e --- /dev/null +++ b/cpp/include/cudf/reduction/detail/histogram.hpp @@ -0,0 +1,57 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include +#include + +#include +#include + +#include +#include + +namespace cudf::reduction::detail { + +/** + * @brief Compute the frequency for each distinct row in the input table. + * + * @param input The input table to compute histogram + * @param partial_counts An optional column containing count for each row + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate memory of the returned objects + * @return A pair of array contains the (stable-order) indices of the distinct rows in the input + * table, and their corresponding distinct counts + */ +[[nodiscard]] std::pair>, std::unique_ptr> +compute_row_frequencies(table_view const& input, + std::optional const& partial_counts, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); + +/** + * @brief Create an empty histogram column. + * + * A histogram column is a structs column `STRUCT` where T is type of the input + * values. + * + * @returns An empty histogram column + */ +[[nodiscard]] std::unique_ptr make_empty_histogram_like(column_view const& values); + +} // namespace cudf::reduction::detail diff --git a/cpp/include/cudf/reduction/detail/reduction_functions.hpp b/cpp/include/cudf/reduction/detail/reduction_functions.hpp index 014a6ba70eb..704332c8e1d 100644 --- a/cpp/include/cudf/reduction/detail/reduction_functions.hpp +++ b/cpp/include/cudf/reduction/detail/reduction_functions.hpp @@ -131,6 +131,33 @@ std::unique_ptr all(column_view const& col, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr); +/** + * @brief Compute frequency for each unique element in the input column. + * + * The result histogram is stored in structs column having two children. The first child contains + * unique elements from the input, and the second child contains their corresponding frequencies. + * + * @param input The column to compute histogram + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate the returned scalar's device memory + * @return A list_scalar storing a structs column as the result histogram + */ +std::unique_ptr histogram(column_view const& input, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); + +/** + * @brief Merge multiple histograms together. 
+ * + * @param input The input given as multiple histograms concatenated together + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate the returned scalar's device memory + * @return A list_scalar storing the result histogram + */ +std::unique_ptr merge_histogram(column_view const& input, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); + /** * @brief Computes product of elements in input column * diff --git a/cpp/src/aggregation/aggregation.cpp b/cpp/src/aggregation/aggregation.cpp index 2e6a643484e..b3f2a774a60 100644 --- a/cpp/src/aggregation/aggregation.cpp +++ b/cpp/src/aggregation/aggregation.cpp @@ -64,6 +64,12 @@ std::vector> simple_aggregations_collector::visit( return visit(col_type, static_cast(agg)); } +std::vector> simple_aggregations_collector::visit( + data_type col_type, histogram_aggregation const& agg) +{ + return visit(col_type, static_cast(agg)); +} + std::vector> simple_aggregations_collector::visit( data_type col_type, any_aggregation const& agg) { @@ -196,6 +202,12 @@ std::vector> simple_aggregations_collector::visit( return visit(col_type, static_cast(agg)); } +std::vector> simple_aggregations_collector::visit( + data_type col_type, merge_histogram_aggregation const& agg) +{ + return visit(col_type, static_cast(agg)); +} + std::vector> simple_aggregations_collector::visit( data_type col_type, covariance_aggregation const& agg) { @@ -246,6 +258,10 @@ void aggregation_finalizer::visit(count_aggregation const& agg) { visit(static_cast(agg)); } +void aggregation_finalizer::visit(histogram_aggregation const& agg) +{ + visit(static_cast(agg)); +} void aggregation_finalizer::visit(any_aggregation const& agg) { @@ -357,6 +373,11 @@ void aggregation_finalizer::visit(merge_m2_aggregation const& agg) visit(static_cast(agg)); } +void aggregation_finalizer::visit(merge_histogram_aggregation const& agg) +{ + visit(static_cast(agg)); +} + void aggregation_finalizer::visit(covariance_aggregation const& agg) { visit(static_cast(agg)); @@ -460,6 +481,16 @@ template std::unique_ptr make_count_aggregation make_count_aggregation( null_policy null_handling); +/// Factory to create a HISTOGRAM aggregation +template +std::unique_ptr make_histogram_aggregation() +{ + return std::make_unique(); +} +template std::unique_ptr make_histogram_aggregation(); +template std::unique_ptr make_histogram_aggregation(); +template std::unique_ptr make_histogram_aggregation(); + /// Factory to create a ANY aggregation template std::unique_ptr make_any_aggregation() @@ -764,6 +795,17 @@ std::unique_ptr make_merge_m2_aggregation() template std::unique_ptr make_merge_m2_aggregation(); template std::unique_ptr make_merge_m2_aggregation(); +/// Factory to create a MERGE_HISTOGRAM aggregation +template +std::unique_ptr make_merge_histogram_aggregation() +{ + return std::make_unique(); +} +template std::unique_ptr make_merge_histogram_aggregation(); +template std::unique_ptr +make_merge_histogram_aggregation(); +template std::unique_ptr make_merge_histogram_aggregation(); + /// Factory to create a COVARIANCE aggregation template std::unique_ptr make_covariance_aggregation(size_type min_periods, size_type ddof) diff --git a/cpp/src/groupby/groupby.cu b/cpp/src/groupby/groupby.cu index ce1fc71968f..e3c021eb66a 100644 --- a/cpp/src/groupby/groupby.cu +++ b/cpp/src/groupby/groupby.cu @@ -28,6 +28,7 @@ #include #include #include +#include #include #include #include @@ -110,6 +111,15 @@ struct 
empty_column_constructor { 0, make_empty_column(type_to_id()), empty_like(values), 0, {}); } + if constexpr (k == aggregation::Kind::HISTOGRAM) { + return make_lists_column(0, + make_empty_column(type_to_id()), + cudf::reduction::detail::make_empty_histogram_like(values), + 0, + {}); + } + if constexpr (k == aggregation::Kind::MERGE_HISTOGRAM) { return empty_like(values); } + if constexpr (k == aggregation::Kind::RANK) { auto const& rank_agg = dynamic_cast(agg); if (rank_agg._method == cudf::rank_method::AVERAGE or diff --git a/cpp/src/groupby/sort/aggregate.cpp b/cpp/src/groupby/sort/aggregate.cpp index 3f977dc81d7..10c271f76f9 100644 --- a/cpp/src/groupby/sort/aggregate.cpp +++ b/cpp/src/groupby/sort/aggregate.cpp @@ -89,6 +89,18 @@ void aggregate_result_functor::operator()(aggregation co detail::group_count_all(helper.group_offsets(stream), helper.num_groups(stream), stream, mr)); } +template <> +void aggregate_result_functor::operator()(aggregation const& agg) +{ + if (cache.has_result(values, agg)) return; + + cache.add_result( + values, + agg, + detail::group_histogram( + get_grouped_values(), helper.group_labels(stream), helper.num_groups(stream), stream, mr)); +} + template <> void aggregate_result_functor::operator()(aggregation const& agg) { @@ -534,6 +546,24 @@ void aggregate_result_functor::operator()(aggregation con get_grouped_values(), helper.group_offsets(stream), helper.num_groups(stream), stream, mr)); } +/** + * @brief Perform merging for multiple histograms that correspond to the same key value. + * + * The partial results input to this aggregation is a structs column that is concatenated from + * multiple outputs of HISTOGRAM aggregations. + */ +template <> +void aggregate_result_functor::operator()(aggregation const& agg) +{ + if (cache.has_result(values, agg)) { return; } + + cache.add_result( + values, + agg, + detail::group_merge_histogram( + get_grouped_values(), helper.group_offsets(stream), helper.num_groups(stream), stream, mr)); +} + /** * @brief Creates column views with only valid elements in both input column views * diff --git a/cpp/src/groupby/sort/group_histogram.cu b/cpp/src/groupby/sort/group_histogram.cu new file mode 100644 index 00000000000..bb70037aaef --- /dev/null +++ b/cpp/src/groupby/sort/group_histogram.cu @@ -0,0 +1,152 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include + +namespace cudf::groupby::detail { + +namespace { + +std::unique_ptr build_histogram(column_view const& values, + cudf::device_span group_labels, + std::optional const& partial_counts, + size_type num_groups, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + CUDF_EXPECTS(static_cast(values.size()) == group_labels.size(), + "Size of values column should be the same as that of group labels.", + std::invalid_argument); + + // Attach group labels to the input values. 
+ auto const labels_cv = column_view{data_type{type_to_id()}, + static_cast(group_labels.size()), + group_labels.data(), + nullptr, + 0}; + auto const labeled_values = table_view{{labels_cv, values}}; + + // Build histogram for the labeled values. + auto [distinct_indices, distinct_counts] = + cudf::reduction::detail::compute_row_frequencies(labeled_values, partial_counts, stream, mr); + + // Gather the distinct rows for the output histogram. + auto out_table = cudf::detail::gather(labeled_values, + *distinct_indices, + out_of_bounds_policy::DONT_CHECK, + cudf::detail::negative_index_policy::NOT_ALLOWED, + stream, + mr); + + // Build offsets for the output lists column containing output histograms. + // Each list will be a histogram corresponding to one value group. + auto out_offsets = cudf::lists::detail::reconstruct_offsets( + out_table->get_column(0).view(), num_groups, stream, mr); + + std::vector> struct_children; + struct_children.emplace_back(std::move(out_table->release().back())); + struct_children.emplace_back(std::move(distinct_counts)); + auto out_structs = make_structs_column(static_cast(distinct_indices->size()), + std::move(struct_children), + 0, + {}, + stream, + mr); + + return make_lists_column( + num_groups, std::move(out_offsets), std::move(out_structs), 0, {}, stream, mr); +} + +} // namespace + +std::unique_ptr group_histogram(column_view const& values, + cudf::device_span group_labels, + size_type num_groups, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + // Empty group should be handled before reaching here. + CUDF_EXPECTS(num_groups > 0, "Group should not be empty.", std::invalid_argument); + + return build_histogram(values, group_labels, std::nullopt, num_groups, stream, mr); +} + +std::unique_ptr group_merge_histogram(column_view const& values, + cudf::device_span group_offsets, + size_type num_groups, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + // Empty group should be handled before reaching here. + CUDF_EXPECTS(num_groups > 0, "Group should not be empty.", std::invalid_argument); + + // The input must be a lists column without nulls. + CUDF_EXPECTS(!values.has_nulls(), "The input column must not have nulls.", std::invalid_argument); + CUDF_EXPECTS(values.type().id() == type_id::LIST, + "The input of MERGE_HISTOGRAM aggregation must be a lists column.", + std::invalid_argument); + + // Child of the input lists column must be a structs column without nulls, + // and its second child is a columns of integer type having no nulls. + auto const lists_cv = lists_column_view{values}; + auto const histogram_cv = lists_cv.get_sliced_child(stream); + CUDF_EXPECTS(!histogram_cv.has_nulls(), + "Child of the input lists column must not have nulls.", + std::invalid_argument); + CUDF_EXPECTS(histogram_cv.type().id() == type_id::STRUCT && histogram_cv.num_children() == 2, + "The input column has invalid histograms structure.", + std::invalid_argument); + CUDF_EXPECTS( + cudf::is_integral(histogram_cv.child(1).type()) && !histogram_cv.child(1).has_nulls(), + "The input column has invalid histograms structure.", + std::invalid_argument); + + // Concatenate the histograms corresponding to the same key values. + // That is equivalent to creating a new lists column (view) from the input lists column + // with new offsets gathered as below. 
+ auto new_offsets = rmm::device_uvector(num_groups + 1, stream); + thrust::gather(rmm::exec_policy(stream), + group_offsets.begin(), + group_offsets.end(), + lists_cv.offsets_begin(), + new_offsets.begin()); + + // Generate labels for the new lists. + auto key_labels = rmm::device_uvector(histogram_cv.size(), stream); + cudf::detail::label_segments( + new_offsets.begin(), new_offsets.end(), key_labels.begin(), key_labels.end(), stream); + + auto const structs_cv = structs_column_view{histogram_cv}; + auto const input_values = structs_cv.get_sliced_child(0, stream); + auto const input_counts = structs_cv.get_sliced_child(1, stream); + + return build_histogram(input_values, key_labels, input_counts, num_groups, stream, mr); +} + +} // namespace cudf::groupby::detail diff --git a/cpp/src/groupby/sort/group_reductions.hpp b/cpp/src/groupby/sort/group_reductions.hpp index fc24b679db5..3aa79f226a3 100644 --- a/cpp/src/groupby/sort/group_reductions.hpp +++ b/cpp/src/groupby/sort/group_reductions.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2022, NVIDIA CORPORATION. + * Copyright (c) 2019-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -216,6 +216,33 @@ std::unique_ptr group_count_all(cudf::device_span group size_type num_groups, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr); +/** + * @brief Internal API to compute histogram for each group in @p values. + * + * The returned column is a lists column, each list corresponds to one input group and stores the + * histogram of the distinct elements in that group in the form of `STRUCT`. + * + * Note that the order of distinct elements in each output list is not specified. + * + * @code{.pseudo} + * values = [2, 1, 1, 3, 5, 2, 2, 3, 1, 4] + * group_labels = [0, 0, 0, 1, 1, 1, 1, 1, 2, 2] + * num_groups = 3 + * + * output = [[<1, 2>, <2, 1>], [<2, 2>, <3, 2>, <5, 1>], [<1, 1>, <4, 1>]] + * @endcode + * + * @param values Grouped values to compute histogram + * @param group_labels ID of group that the corresponding value belongs to + * @param num_groups Number of groups + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate the returned column's device memory + */ +std::unique_ptr group_histogram(column_view const& values, + cudf::device_span group_labels, + size_type num_groups, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); /** * @brief Internal API to calculate sum of squares of differences from means. @@ -441,6 +468,34 @@ std::unique_ptr group_merge_m2(column_view const& values, size_type num_groups, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr); + +/** + * @brief Internal API to merge multiple output of HISTOGRAM aggregation. + * + * The input values column should be given as a lists column in the form of + * `LIST>`. + * After merging, the order of distinct elements in each output list is not specified. 
+ * + * @code{.pseudo} + * values = [ [<1, 2>, <2, 1>], [<2, 2>], [<3, 2>, <2, 1>], [<1, 1>, <2, 1>] ] + * group_offsets = [ 0, 2, 4] + * num_groups = 2 + * + * output = [[<1, 2>, <2, 3>], [<1, 1>, <2, 2>, <3, 2>]]] + * @endcode + * + * @param values Grouped values to get valid count of + * @param group_offsets Offsets of groups' starting points within @p values + * @param num_groups Number of groups + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate the returned column's device memory + */ +std::unique_ptr group_merge_histogram(column_view const& values, + cudf::device_span group_offsets, + size_type num_groups, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); + /** * @brief Internal API to find covariance of child columns of a non-nullable struct column. * diff --git a/cpp/src/reductions/histogram.cu b/cpp/src/reductions/histogram.cu new file mode 100644 index 00000000000..fa84bbeb25d --- /dev/null +++ b/cpp/src/reductions/histogram.cu @@ -0,0 +1,273 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include + +#include + +namespace cudf::reduction::detail { + +namespace { + +// Always use 64-bit signed integer for storing count. +using histogram_count_type = int64_t; + +/** + * @brief The functor to accumulate the frequency of each distinct rows in the input table. + */ +template +struct reduce_fn : cudf::detail::reduce_by_row_fn_base { + CountType const* d_partial_output; + + reduce_fn(MapView const& d_map, + KeyHasher const& d_hasher, + KeyEqual const& d_equal, + CountType* const d_output, + CountType const* const d_partial_output) + : cudf::detail::reduce_by_row_fn_base{d_map, + d_hasher, + d_equal, + d_output}, + d_partial_output{d_partial_output} + { + } + + // Count the number of rows in each group of rows that are compared equal. + __device__ void operator()(size_type const idx) const + { + auto const increment = d_partial_output ? d_partial_output[idx] : CountType{1}; + auto const count = + cuda::atomic_ref(*this->get_output_ptr(idx)); + count.fetch_add(increment, cuda::std::memory_order_relaxed); + } +}; + +/** + * @brief The builder to construct an instance of `reduce_fn` functor. + */ +template +struct reduce_func_builder { + CountType const* const d_partial_output; + + reduce_func_builder(CountType const* const d_partial_output) : d_partial_output{d_partial_output} + { + } + + template + auto build(MapView const& d_map, + KeyHasher const& d_hasher, + KeyEqual const& d_equal, + CountType* const d_output) + { + return reduce_fn{ + d_map, d_hasher, d_equal, d_output, d_partial_output}; + } +}; + +/** + * @brief Specialized functor to check for not-zero of the second component of the input. 
+ */ +struct is_not_zero { + template + __device__ bool operator()(Pair const input) const + { + return thrust::get<1>(input) != 0; + } +}; + +/** + * @brief Building a histogram by gathering distinct rows from the input table and their + * corresponding distinct counts. + * + * @param input The input table + * @param distinct_indices Indices of the distinct rows + * @param distinct_counts Distinct counts corresponding to the distinct rows + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate the returned object's device memory + * @return A list_scalar storing the output histogram + */ +auto gather_histogram(table_view const& input, + device_span distinct_indices, + std::unique_ptr&& distinct_counts, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + auto distinct_rows = cudf::detail::gather(input, + distinct_indices, + out_of_bounds_policy::DONT_CHECK, + cudf::detail::negative_index_policy::NOT_ALLOWED, + stream, + mr); + + std::vector> struct_children; + struct_children.emplace_back(std::move(distinct_rows->release().front())); + struct_children.emplace_back(std::move(distinct_counts)); + auto output_structs = make_structs_column( + static_cast(distinct_indices.size()), std::move(struct_children), 0, {}, stream, mr); + + return std::make_unique( + std::move(*output_structs.release()), true, stream, mr); +} + +} // namespace + +std::unique_ptr make_empty_histogram_like(column_view const& values) +{ + std::vector> struct_children; + struct_children.emplace_back(empty_like(values)); + struct_children.emplace_back(make_numeric_column(data_type{type_id::INT64}, 0)); + return std::make_unique(data_type{type_id::STRUCT}, + 0, + rmm::device_buffer{}, + rmm::device_buffer{}, + 0, + std::move(struct_children)); +} + +std::pair>, std::unique_ptr> +compute_row_frequencies(table_view const& input, + std::optional const& partial_counts, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + auto const has_nested_columns = cudf::detail::has_nested_columns(input); + + // Nested types are not tested, thus we just throw exception if we see such input for now. + // We should remove this check after having enough tests. + CUDF_EXPECTS(!has_nested_columns, + "Nested types are not yet supported in histogram aggregation.", + std::invalid_argument); + + auto map = cudf::detail::hash_map_type{ + compute_hash_table_size(input.num_rows()), + cuco::empty_key{-1}, + cuco::empty_value{std::numeric_limits::min()}, + cudf::detail::hash_table_allocator_type{default_allocator{}, stream}, + stream.value()}; + + auto const preprocessed_input = + cudf::experimental::row::hash::preprocessed_table::create(input, stream); + auto const has_nulls = nullate::DYNAMIC{cudf::has_nested_nulls(input)}; + + auto const row_hasher = cudf::experimental::row::hash::row_hasher(preprocessed_input); + auto const key_hasher = row_hasher.device_hasher(has_nulls); + auto const row_comp = cudf::experimental::row::equality::self_comparator(preprocessed_input); + + auto const pair_iter = cudf::detail::make_counting_transform_iterator( + size_type{0}, [] __device__(size_type const i) { return cuco::make_pair(i, i); }); + + // Always compare NaNs as equal. 
+ using nan_equal_comparator = + cudf::experimental::row::equality::nan_equal_physical_equality_comparator; + auto const value_comp = nan_equal_comparator{}; + + if (has_nested_columns) { + auto const key_equal = row_comp.equal_to(has_nulls, null_equality::EQUAL, value_comp); + map.insert(pair_iter, pair_iter + input.num_rows(), key_hasher, key_equal, stream.value()); + } else { + auto const key_equal = row_comp.equal_to(has_nulls, null_equality::EQUAL, value_comp); + map.insert(pair_iter, pair_iter + input.num_rows(), key_hasher, key_equal, stream.value()); + } + + // Gather the indices of distinct rows. + auto distinct_indices = std::make_unique>( + static_cast(map.get_size()), stream, mr); + + // Store the number of occurrences of each distinct row. + auto distinct_counts = make_numeric_column(data_type{type_to_id()}, + static_cast(map.get_size()), + mask_state::UNALLOCATED, + stream, + mr); + + // Compute frequencies (aka distinct counts) for the input rows. + // Note that we consider null and NaNs as always equal. + auto const reduction_results = cudf::detail::hash_reduce_by_row( + map, + preprocessed_input, + input.num_rows(), + has_nulls, + has_nested_columns, + null_equality::EQUAL, + nan_equality::ALL_EQUAL, + reduce_func_builder{ + partial_counts ? partial_counts.value().begin() : nullptr}, + histogram_count_type{0}, + stream, + rmm::mr::get_current_device_resource()); + + auto const input_it = thrust::make_zip_iterator( + thrust::make_tuple(thrust::make_counting_iterator(0), reduction_results.begin())); + auto const output_it = thrust::make_zip_iterator(thrust::make_tuple( + distinct_indices->begin(), distinct_counts->mutable_view().begin())); + + // Reduction results above are either group sizes of equal rows, or `0`. + // The final output is non-zero group sizes only. + thrust::copy_if( + rmm::exec_policy(stream), input_it, input_it + input.num_rows(), output_it, is_not_zero{}); + + return {std::move(distinct_indices), std::move(distinct_counts)}; +} + +std::unique_ptr histogram(column_view const& input, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + // Empty group should be handled before reaching here. + CUDF_EXPECTS(input.size() > 0, "Input should not be empty.", std::invalid_argument); + + auto const input_tv = table_view{{input}}; + auto [distinct_indices, distinct_counts] = + compute_row_frequencies(input_tv, std::nullopt, stream, mr); + return gather_histogram(input_tv, *distinct_indices, std::move(distinct_counts), stream, mr); +} + +std::unique_ptr merge_histogram(column_view const& input, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + // Empty group should be handled before reaching here. 
+ CUDF_EXPECTS(input.size() > 0, "Input should not be empty.", std::invalid_argument); + CUDF_EXPECTS(!input.has_nulls(), "The input column must not have nulls.", std::invalid_argument); + CUDF_EXPECTS(input.type().id() == type_id::STRUCT && input.num_children() == 2, + "The input must be a structs column having two children.", + std::invalid_argument); + CUDF_EXPECTS(cudf::is_integral(input.child(1).type()) && !input.child(1).has_nulls(), + "The second child of the input column must be of integral type and without nulls.", + std::invalid_argument); + + auto const structs_cv = structs_column_view{input}; + auto const input_values = structs_cv.get_sliced_child(0, stream); + auto const input_counts = structs_cv.get_sliced_child(1, stream); + + auto const values_tv = table_view{{input_values}}; + auto [distinct_indices, distinct_counts] = + compute_row_frequencies(values_tv, input_counts, stream, mr); + return gather_histogram(values_tv, *distinct_indices, std::move(distinct_counts), stream, mr); +} + +} // namespace cudf::reduction::detail diff --git a/cpp/src/reductions/reductions.cpp b/cpp/src/reductions/reductions.cpp index 2fef8aa8785..23171baaa45 100644 --- a/cpp/src/reductions/reductions.cpp +++ b/cpp/src/reductions/reductions.cpp @@ -23,6 +23,7 @@ #include #include #include +#include #include #include #include @@ -59,6 +60,8 @@ struct reduce_dispatch_functor { case aggregation::MAX: return max(col, output_dtype, init, stream, mr); case aggregation::ANY: return any(col, output_dtype, init, stream, mr); case aggregation::ALL: return all(col, output_dtype, init, stream, mr); + case aggregation::HISTOGRAM: return histogram(col, stream, mr); + case aggregation::MERGE_HISTOGRAM: return merge_histogram(col, stream, mr); case aggregation::SUM_OF_SQUARES: return sum_of_squares(col, output_dtype, stream, mr); case aggregation::MEAN: return mean(col, output_dtype, stream, mr); case aggregation::VARIANCE: { @@ -165,6 +168,15 @@ std::unique_ptr reduce(column_view const& col, return tdigest::detail::make_empty_tdigest_scalar(stream, mr); } + if (agg.kind == aggregation::HISTOGRAM) { + return std::make_unique( + std::move(*reduction::detail::make_empty_histogram_like(col)), true, stream, mr); + } + if (agg.kind == aggregation::MERGE_HISTOGRAM) { + return std::make_unique( + std::move(*reduction::detail::make_empty_histogram_like(col.child(0))), true, stream, mr); + } + if (output_dtype.id() == type_id::LIST) { if (col.type() == output_dtype) { return make_empty_scalar_like(col, stream, mr); } // Under some circumstance, the output type will become the List of input type, diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt index 68ff6c54c99..04939f3cd6d 100644 --- a/cpp/tests/CMakeLists.txt +++ b/cpp/tests/CMakeLists.txt @@ -116,6 +116,7 @@ ConfigureTest( groupby/covariance_tests.cpp groupby/groupby_test_util.cpp groupby/groups_tests.cpp + groupby/histogram_tests.cpp groupby/keys_tests.cpp groupby/lists_tests.cpp groupby/m2_tests.cpp diff --git a/cpp/tests/groupby/histogram_tests.cpp b/cpp/tests/groupby/histogram_tests.cpp new file mode 100644 index 00000000000..c5833f40cf2 --- /dev/null +++ b/cpp/tests/groupby/histogram_tests.cpp @@ -0,0 +1,396 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +using int32s_col = cudf::test::fixed_width_column_wrapper; +using int64s_col = cudf::test::fixed_width_column_wrapper; +using structs_col = cudf::test::structs_column_wrapper; + +auto groupby_histogram(cudf::column_view const& keys, + cudf::column_view const& values, + cudf::aggregation::Kind agg_kind) +{ + CUDF_EXPECTS( + agg_kind == cudf::aggregation::HISTOGRAM || agg_kind == cudf::aggregation::MERGE_HISTOGRAM, + "Aggregation must be either HISTOGRAM or MERGE_HISTOGRAM."); + + std::vector requests; + requests.emplace_back(); + requests[0].values = values; + if (agg_kind == cudf::aggregation::HISTOGRAM) { + requests[0].aggregations.push_back( + cudf::make_histogram_aggregation()); + } else { + requests[0].aggregations.push_back( + cudf::make_merge_histogram_aggregation()); + } + + auto gb_obj = cudf::groupby::groupby(cudf::table_view({keys})); + auto const agg_results = gb_obj.aggregate(requests, cudf::test::get_default_stream()); + auto const agg_histogram = agg_results.second[0].results[0]->view(); + EXPECT_EQ(agg_histogram.type().id(), cudf::type_id::LIST); + EXPECT_EQ(agg_histogram.null_count(), 0); + + auto const histograms = cudf::lists_column_view{agg_histogram}.child(); + EXPECT_EQ(histograms.num_children(), 2); + EXPECT_EQ(histograms.null_count(), 0); + EXPECT_EQ(histograms.child(1).null_count(), 0); + + auto const key_sort_order = cudf::sorted_order(agg_results.first->view(), {}, {}); + auto sorted_keys = + std::move(cudf::gather(agg_results.first->view(), *key_sort_order)->release().front()); + auto const sorted_vals = + std::move(cudf::gather(cudf::table_view{{agg_histogram}}, *key_sort_order)->release().front()); + auto sorted_histograms = cudf::lists::sort_lists(cudf::lists_column_view{*sorted_vals}, + cudf::order::ASCENDING, + cudf::null_order::BEFORE, + rmm::mr::get_current_device_resource()); + + return std::pair{std::move(sorted_keys), std::move(sorted_histograms)}; +} + +template +struct GroupbyHistogramTest : public cudf::test::BaseFixture {}; + +template +struct GroupbyMergeHistogramTest : public cudf::test::BaseFixture {}; + +// Avoid unsigned types, as the tests below have negative values in their input. +using HistogramTestTypes = cudf::test::Concat, + cudf::test::FloatingPointTypes, + cudf::test::FixedPointTypes, + cudf::test::ChronoTypes>; +TYPED_TEST_SUITE(GroupbyHistogramTest, HistogramTestTypes); +TYPED_TEST_SUITE(GroupbyMergeHistogramTest, HistogramTestTypes); + +TYPED_TEST(GroupbyHistogramTest, EmptyInput) +{ + using col_data = cudf::test::fixed_width_column_wrapper; + + auto const keys = int32s_col{}; + auto const values = col_data{}; + auto const [res_keys, res_histogram] = + groupby_histogram(keys, values, cudf::aggregation::HISTOGRAM); + + // The structure of the output is already verified in the function `groupby_histogram`. 
+ ASSERT_EQ(res_histogram->size(), 0); +} + +TYPED_TEST(GroupbyHistogramTest, SimpleInputNoNull) +{ + using col_data = cudf::test::fixed_width_column_wrapper; + + // key = 0: values = [2, 2, -3, -2, 2] + // key = 1: values = [2, 0, 5, 2, 1] + // key = 2: values = [-3, 1, 1, 2, 2] + auto const keys = int32s_col{2, 0, 2, 1, 1, 1, 0, 0, 0, 1, 2, 2, 1, 0, 2}; + auto const values = col_data{-3, 2, 1, 2, 0, 5, 2, -3, -2, 2, 1, 2, 1, 2, 2}; + + auto const expected_keys = int32s_col{0, 1, 2}; + auto const expected_histogram = [] { + auto structs = [] { + auto values = col_data{-3, -2, 2, 0, 1, 2, 5, -3, 1, 2}; + auto counts = int64s_col{1, 1, 3, 1, 1, 2, 1, 1, 2, 2}; + return structs_col{{values, counts}}; + }(); + return cudf::make_lists_column( + 3, int32s_col{0, 3, 7, 10}.release(), structs.release(), 0, rmm::device_buffer{}); + }(); + + auto const [res_keys, res_histogram] = + groupby_histogram(keys, values, cudf::aggregation::HISTOGRAM); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected_keys, *res_keys); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*expected_histogram, *res_histogram); +} + +TYPED_TEST(GroupbyHistogramTest, SlicedInputNoNull) +{ + using col_data = cudf::test::fixed_width_column_wrapper; + + auto const keys_original = int32s_col{2, 0, 2, 1, 0, 2, 0, 2, 1, 1, 1, 0, 0, 0, 1, 2, 2, 1, 0, 2}; + auto const values_original = + col_data{1, 2, 0, 2, 1, -3, 2, 1, 2, 0, 5, 2, -3, -2, 2, 1, 2, 1, 2, 2}; + // key = 0: values = [2, 2, -3, -2, 2] + // key = 1: values = [2, 0, 5, 2, 1] + // key = 2: values = [-3, 1, 1, 2, 2] + auto const keys = cudf::slice(keys_original, {5, 20})[0]; + auto const values = cudf::slice(values_original, {5, 20})[0]; + + auto const expected_keys = int32s_col{0, 1, 2}; + auto const expected_histogram = [] { + auto structs = [] { + auto values = col_data{-3, -2, 2, 0, 1, 2, 5, -3, 1, 2}; + auto counts = int64s_col{1, 1, 3, 1, 1, 2, 1, 1, 2, 2}; + return structs_col{{values, counts}}; + }(); + return cudf::make_lists_column( + 3, int32s_col{0, 3, 7, 10}.release(), structs.release(), 0, rmm::device_buffer{}); + }(); + + auto const [res_keys, res_histogram] = + groupby_histogram(keys, values, cudf::aggregation::HISTOGRAM); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected_keys, *res_keys); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*expected_histogram, *res_histogram); +} + +TYPED_TEST(GroupbyHistogramTest, InputWithNulls) +{ + using col_data = cudf::test::fixed_width_column_wrapper; + using namespace cudf::test::iterators; + auto constexpr null{0}; + + // key = 0: values = [-3, null, 2, null, 2] + // key = 1: values = [1, 2, null, 5, 2, -3, 1, 1] + // key = 2: values = [null, 2, 0, -2, 2, null, 2] + auto const keys = int32s_col{2, 0, 2, 1, 1, 1, 2, 1, 1, 0, 1, 2, 0, 0, 1, 2, 2, 1, 0, 2}; + auto const values = + col_data{{null, -3, 2, 1, 2, null, 0, 5, 2, null, -3, -2, 2, null, 1, 2, null, 1, 2, 2}, + nulls_at({0, 5, 9, 13, 16})}; + + auto const expected_keys = int32s_col{0, 1, 2}; + auto const expected_histogram = [] { + auto structs = [] { + auto values = col_data{{null, -3, 2, null, -3, 1, 2, 5, null, -2, 0, 2}, nulls_at({0, 3, 8})}; + auto counts = int64s_col{2, 1, 2, 1, 1, 3, 2, 1, 2, 1, 1, 3}; + return structs_col{{values, counts}}; + }(); + return cudf::make_lists_column( + 3, int32s_col{0, 3, 8, 12}.release(), structs.release(), 0, rmm::device_buffer{}); + }(); + + auto const [res_keys, res_histogram] = + groupby_histogram(keys, values, cudf::aggregation::HISTOGRAM); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected_keys, *res_keys); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*expected_histogram, *res_histogram); 
+} + +TYPED_TEST(GroupbyHistogramTest, SlicedInputWithNulls) +{ + using col_data = cudf::test::fixed_width_column_wrapper; + using namespace cudf::test::iterators; + auto constexpr null{0}; + + auto const keys_original = + int32s_col{1, 0, 2, 2, 0, 2, 0, 2, 1, 1, 1, 2, 1, 1, 0, 1, 2, 0, 0, 1, 2, 2, 1, 0, 2, 0, 1, 2}; + auto const values_original = + col_data{{null, 1, 1, 2, 1, null, -3, 2, 1, 2, null, 0, 5, 2, + null, -3, -2, 2, null, 1, 2, null, 1, 2, 2, null, 1, 2}, + nulls_at({0, 5, 10, 14, 18, 21, 25})}; + + // key = 0: values = [-3, null, 2, null, 2] + // key = 1: values = [1, 2, null, 5, 2, -3, 1, 1] + // key = 2: values = [null, 2, 0, -2, 2, null, 2] + auto const keys = cudf::slice(keys_original, {5, 25})[0]; + auto const values = cudf::slice(values_original, {5, 25})[0]; + + auto const expected_keys = int32s_col{0, 1, 2}; + auto const expected_histogram = [] { + auto structs = [] { + auto values = col_data{{null, -3, 2, null, -3, 1, 2, 5, null, -2, 0, 2}, nulls_at({0, 3, 8})}; + auto counts = int64s_col{2, 1, 2, 1, 1, 3, 2, 1, 2, 1, 1, 3}; + return structs_col{{values, counts}}; + }(); + return cudf::make_lists_column( + 3, int32s_col{0, 3, 8, 12}.release(), structs.release(), 0, rmm::device_buffer{}); + }(); + + auto const [res_keys, res_histogram] = + groupby_histogram(keys, values, cudf::aggregation::HISTOGRAM); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected_keys, *res_keys); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*expected_histogram, *res_histogram); +} + +TYPED_TEST(GroupbyMergeHistogramTest, EmptyInput) +{ + using col_data = cudf::test::fixed_width_column_wrapper; + + auto const keys = int32s_col{}; + auto const values = [] { + auto structs = [] { + auto values = col_data{}; + auto counts = int64s_col{}; + return structs_col{{values, counts}}; + }(); + return cudf::make_lists_column( + 0, int32s_col{}.release(), structs.release(), 0, rmm::device_buffer{}); + }(); + auto const [res_keys, res_histogram] = + groupby_histogram(keys, *values, cudf::aggregation::MERGE_HISTOGRAM); + + // The structure of the output is already verified in the function `groupby_histogram`. 
+ ASSERT_EQ(res_histogram->size(), 0); +} + +TYPED_TEST(GroupbyMergeHistogramTest, SimpleInputNoNull) +{ + using col_data = cudf::test::fixed_width_column_wrapper; + + // key = 0: histograms = [[<-3, 1>, <-2, 1>, <2, 3>], [<0, 1>, <1, 1>], [<-3, 3>, <0, 1>, <1, 2>]] + // key = 1: histograms = [[<-2, 1>, <1, 3>, <2, 2>], [<0, 2>, <1, 1>, <2, 2>]] + auto const keys = int32s_col{0, 1, 0, 1, 0}; + auto const values = [] { + auto structs = [] { + auto values = col_data{-3, -2, 2, -2, 1, 2, 0, 1, 0, 1, 2, -3, 0, 1}; + auto counts = int64s_col{1, 1, 3, 1, 3, 2, 1, 1, 2, 1, 2, 3, 1, 2}; + return structs_col{{values, counts}}; + }(); + return cudf::make_lists_column( + 5, int32s_col{0, 3, 6, 8, 11, 14}.release(), structs.release(), 0, rmm::device_buffer{}); + }(); + + auto const expected_keys = int32s_col{0, 1}; + auto const expected_histogram = [] { + auto structs = [] { + auto values = col_data{-3, -2, 0, 1, 2, -2, 0, 1, 2}; + auto counts = int64s_col{4, 1, 2, 3, 3, 1, 2, 4, 4}; + return structs_col{{values, counts}}; + }(); + return cudf::make_lists_column( + 2, int32s_col{0, 5, 9}.release(), structs.release(), 0, rmm::device_buffer{}); + }(); + + auto const [res_keys, res_histogram] = + groupby_histogram(keys, *values, cudf::aggregation::MERGE_HISTOGRAM); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected_keys, *res_keys); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*expected_histogram, *res_histogram); +} + +TYPED_TEST(GroupbyMergeHistogramTest, SlicedInputNoNull) +{ + using col_data = cudf::test::fixed_width_column_wrapper; + + // key = 0: histograms = [[<-3, 1>, <-2, 1>, <2, 3>], [<0, 1>, <1, 1>], [<-3, 3>, <0, 1>, <1, 2>]] + // key = 1: histograms = [[<-2, 1>, <1, 3>, <2, 2>], [<0, 2>, <1, 1>, <2, 2>]] + auto const keys_original = int32s_col{0, 1, 0, 1, 0, 1, 0}; + auto const values_original = [] { + auto structs = [] { + auto values = col_data{0, 2, -3, 1, -3, -2, 2, -2, 1, 2, 0, 1, 0, 1, 2, -3, 0, 1}; + auto counts = int64s_col{1, 2, 3, 1, 1, 1, 3, 1, 3, 2, 1, 1, 2, 1, 2, 3, 1, 2}; + return structs_col{{values, counts}}; + }(); + return cudf::make_lists_column(7, + int32s_col{0, 2, 4, 7, 10, 12, 15, 18}.release(), + structs.release(), + 0, + rmm::device_buffer{}); + }(); + auto const keys = cudf::slice(keys_original, {2, 7})[0]; + auto const values = cudf::slice(*values_original, {2, 7})[0]; + + auto const expected_keys = int32s_col{0, 1}; + auto const expected_histogram = [] { + auto structs = [] { + auto values = col_data{-3, -2, 0, 1, 2, -2, 0, 1, 2}; + auto counts = int64s_col{4, 1, 2, 3, 3, 1, 2, 4, 4}; + return structs_col{{values, counts}}; + }(); + return cudf::make_lists_column( + 2, int32s_col{0, 5, 9}.release(), structs.release(), 0, rmm::device_buffer{}); + }(); + + auto const [res_keys, res_histogram] = + groupby_histogram(keys, values, cudf::aggregation::MERGE_HISTOGRAM); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected_keys, *res_keys); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*expected_histogram, *res_histogram); +} + +TYPED_TEST(GroupbyMergeHistogramTest, InputWithNulls) +{ + using col_data = cudf::test::fixed_width_column_wrapper; + using namespace cudf::test::iterators; + auto constexpr null{0}; + + // key = 0: histograms = [[, <2, 3>], [, <1, 1>], [<0, 1>, <1, 2>]] + // key = 1: histograms = [[, <1, 3>, <2, 2>], [<0, 2>, <1, 1>, <2, 2>]] + auto const keys = int32s_col{0, 1, 1, 0, 0}; + auto const values = [] { + auto structs = [] { + auto values = col_data{{null, 2, null, 1, 2, 0, 1, 2, null, 1, 0, 1}, nulls_at({0, 2, 8})}; + auto counts = int64s_col{1, 3, 1, 3, 2, 2, 1, 2, 2, 1, 1, 2}; + return 
structs_col{{values, counts}}; + }(); + return cudf::make_lists_column( + 5, int32s_col{0, 2, 5, 8, 10, 12}.release(), structs.release(), 0, rmm::device_buffer{}); + }(); + + auto const expected_keys = int32s_col{0, 1}; + auto const expected_histogram = [] { + auto structs = [] { + auto values = col_data{{null, 0, 1, 2, null, 0, 1, 2}, nulls_at({0, 4})}; + auto counts = int64s_col{3, 1, 3, 3, 1, 2, 4, 4}; + return structs_col{{values, counts}}; + }(); + return cudf::make_lists_column( + 2, int32s_col{0, 4, 8}.release(), structs.release(), 0, rmm::device_buffer{}); + }(); + + auto const [res_keys, res_histogram] = + groupby_histogram(keys, *values, cudf::aggregation::MERGE_HISTOGRAM); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected_keys, *res_keys); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*expected_histogram, *res_histogram); +} + +TYPED_TEST(GroupbyMergeHistogramTest, SlicedInputWithNulls) +{ + using col_data = cudf::test::fixed_width_column_wrapper; + using namespace cudf::test::iterators; + auto constexpr null{0}; + + // key = 0: histograms = [[, <2, 3>], [, <1, 1>], [<0, 1>, <1, 2>]] + // key = 1: histograms = [[, <1, 3>, <2, 2>], [<0, 2>, <1, 1>, <2, 2>]] + auto const keys_original = int32s_col{0, 1, 0, 1, 1, 0, 0}; + auto const values_original = [] { + auto structs = [] { + auto values = col_data{{null, 2, null, 1, null, 2, null, 1, 2, 0, 1, 2, null, 1, 0, 1}, + nulls_at({0, 2, 4, 6, 12})}; + auto counts = int64s_col{1, 3, 2, 1, 1, 3, 1, 3, 2, 2, 1, 2, 2, 1, 1, 2}; + return structs_col{{values, counts}}; + }(); + return cudf::make_lists_column(7, + int32s_col{0, 2, 4, 6, 9, 12, 14, 16}.release(), + structs.release(), + 0, + rmm::device_buffer{}); + }(); + auto const keys = cudf::slice(keys_original, {2, 7})[0]; + auto const values = cudf::slice(*values_original, {2, 7})[0]; + + auto const expected_keys = int32s_col{0, 1}; + auto const expected_histogram = [] { + auto structs = [] { + auto values = col_data{{null, 0, 1, 2, null, 0, 1, 2}, nulls_at({0, 4})}; + auto counts = int64s_col{3, 1, 3, 3, 1, 2, 4, 4}; + return structs_col{{values, counts}}; + }(); + return cudf::make_lists_column( + 2, int32s_col{0, 4, 8}.release(), structs.release(), 0, rmm::device_buffer{}); + }(); + + auto const [res_keys, res_histogram] = + groupby_histogram(keys, values, cudf::aggregation::MERGE_HISTOGRAM); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected_keys, *res_keys); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*expected_histogram, *res_histogram); +} diff --git a/cpp/tests/reductions/reduction_tests.cpp b/cpp/tests/reductions/reduction_tests.cpp index 2561f3f9886..7644ac48892 100644 --- a/cpp/tests/reductions/reduction_tests.cpp +++ b/cpp/tests/reductions/reduction_tests.cpp @@ -28,6 +28,7 @@ #include #include #include +#include #include #include @@ -379,6 +380,212 @@ TYPED_TEST(ReductionTest, SumOfSquare) expected_null_value); } +auto histogram_reduction(cudf::column_view const& input, + std::unique_ptr const& agg) +{ + CUDF_EXPECTS( + agg->kind == cudf::aggregation::HISTOGRAM || agg->kind == cudf::aggregation::MERGE_HISTOGRAM, + "Aggregation must be either HISTOGRAM or MERGE_HISTOGRAM."); + + auto const result_scalar = cudf::reduce(input, *agg, cudf::data_type{cudf::type_id::INT64}); + EXPECT_EQ(result_scalar->is_valid(), true); + + auto const result_list_scalar = dynamic_cast(result_scalar.get()); + EXPECT_NE(result_list_scalar, nullptr); + + auto const histogram = result_list_scalar->view(); + EXPECT_EQ(histogram.num_children(), 2); + EXPECT_EQ(histogram.null_count(), 0); + EXPECT_EQ(histogram.child(1).null_count(), 0); + + // 
Sort the histogram based on the first column (unique input values). + auto const sort_order = cudf::sorted_order(cudf::table_view{{histogram.child(0)}}, {}, {}); + return std::move(cudf::gather(cudf::table_view{{histogram}}, *sort_order)->release().front()); +} + +template +struct ReductionHistogramTest : public cudf::test::BaseFixture {}; + +// Avoid unsigned types, as the tests below have negative values in their input. +using HistogramTestTypes = cudf::test::Concat, + cudf::test::FloatingPointTypes, + cudf::test::FixedPointTypes, + cudf::test::ChronoTypes>; +TYPED_TEST_SUITE(ReductionHistogramTest, HistogramTestTypes); + +TYPED_TEST(ReductionHistogramTest, Histogram) +{ + using data_col = cudf::test::fixed_width_column_wrapper; + using int64_col = cudf::test::fixed_width_column_wrapper; + using structs_col = cudf::test::structs_column_wrapper; + + auto const agg = cudf::make_histogram_aggregation(); + + // Empty input. + { + auto const input = data_col{}; + auto const expected = [] { + auto child1 = data_col{}; + auto child2 = int64_col{}; + return structs_col{{child1, child2}}; + }(); + auto const result = histogram_reduction(input, agg); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *result); + } + + { + auto const input = data_col{-3, 2, 1, 2, 0, 5, 2, -3, -2, 2, 1}; + auto const expected = [] { + auto child1 = data_col{-3, -2, 0, 1, 2, 5}; + auto child2 = int64_col{2, 1, 1, 2, 4, 1}; + return structs_col{{child1, child2}}; + }(); + auto const result = histogram_reduction(input, agg); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *result); + } + + // Test without nulls, sliced input. + { + auto const input_original = data_col{-3, 2, 1, 2, 0, 5, 2, -3, -2, 2, 1}; + auto const input = cudf::slice(input_original, {0, 7})[0]; + auto const expected = [] { + auto child1 = data_col{-3, 0, 1, 2, 5}; + auto child2 = int64_col{1, 1, 1, 3, 1}; + return structs_col{{child1, child2}}; + }(); + auto const result = histogram_reduction(input, agg); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *result); + } + + // Test with nulls. + using namespace cudf::test::iterators; + auto constexpr null{0}; + { + auto const input = data_col{{null, -3, 2, 1, 2, 0, null, 5, 2, null, -3, -2, null, 2, 1}, + nulls_at({0, 6, 9, 12})}; + auto const expected = [] { + auto child1 = data_col{{null, -3, -2, 0, 1, 2, 5}, null_at(0)}; + auto child2 = int64_col{4, 2, 1, 1, 2, 4, 1}; + return structs_col{{child1, child2}}; + }(); + auto const result = histogram_reduction(input, agg); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *result); + } + + // Test with nulls, sliced input. + { + auto const input_original = data_col{ + {null, -3, 2, 1, 2, 0, null, 5, 2, null, -3, -2, null, 2, 1}, nulls_at({0, 6, 9, 12})}; + auto const input = cudf::slice(input_original, {0, 9})[0]; + auto const expected = [] { + auto child1 = data_col{{null, -3, 0, 1, 2, 5}, null_at(0)}; + auto child2 = int64_col{2, 1, 1, 1, 3, 1}; + return structs_col{{child1, child2}}; + }(); + auto const result = histogram_reduction(input, agg); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *result); + } +} + +TYPED_TEST(ReductionHistogramTest, MergeHistogram) +{ + using data_col = cudf::test::fixed_width_column_wrapper; + using int64_col = cudf::test::fixed_width_column_wrapper; + using structs_col = cudf::test::structs_column_wrapper; + + auto const agg = cudf::make_merge_histogram_aggregation(); + + // Empty input. 
+ { + auto const input = [] { + auto child1 = data_col{}; + auto child2 = int64_col{}; + return structs_col{{child1, child2}}; + }(); + auto const expected = [] { + auto child1 = data_col{}; + auto child2 = int64_col{}; + return structs_col{{child1, child2}}; + }(); + auto const result = histogram_reduction(input, agg); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *result); + } + + // Test without nulls. + { + auto const input = [] { + auto child1 = data_col{-3, 2, 1, 2, 0, 5, 2, -3, -2, 2, 1}; + auto child2 = int64_col{2, 1, 1, 2, 4, 1, 2, 3, 5, 3, 4}; + return structs_col{{child1, child2}}; + }(); + + auto const expected = [] { + auto child1 = data_col{-3, -2, 0, 1, 2, 5}; + auto child2 = int64_col{5, 5, 4, 5, 8, 1}; + return structs_col{{child1, child2}}; + }(); + auto const result = histogram_reduction(input, agg); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *result); + } + + // Test without nulls, sliced input. + { + auto const input_original = [] { + auto child1 = data_col{-3, 2, 1, 2, 0, 5, 2, -3, -2, 2, 1}; + auto child2 = int64_col{2, 1, 1, 2, 4, 1, 2, 3, 5, 3, 4}; + return structs_col{{child1, child2}}; + }(); + auto const input = cudf::slice(input_original, {0, 7})[0]; + + auto const expected = [] { + auto child1 = data_col{-3, 0, 1, 2, 5}; + auto child2 = int64_col{2, 4, 1, 5, 1}; + return structs_col{{child1, child2}}; + }(); + auto const result = histogram_reduction(input, agg); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *result); + } + + // Test with nulls. + using namespace cudf::test::iterators; + auto constexpr null{0}; + { + auto const input = [] { + auto child1 = data_col{{-3, 2, null, 1, 2, null, 0, 5, null, 2, -3, null, -2, 2, 1, null}, + nulls_at({2, 5, 8, 11, 15})}; + auto child2 = int64_col{2, 1, 12, 1, 2, 11, 4, 1, 10, 2, 3, 15, 5, 3, 4, 19}; + return structs_col{{child1, child2}}; + }(); + + auto const expected = [] { + auto child1 = data_col{{null, -3, -2, 0, 1, 2, 5}, null_at(0)}; + auto child2 = int64_col{67, 5, 5, 4, 5, 8, 1}; + return structs_col{{child1, child2}}; + }(); + auto const result = histogram_reduction(input, agg); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *result); + } + + // Test with nulls, sliced input. + { + auto const input_original = [] { + auto child1 = data_col{{-3, 2, null, 1, 2, null, 0, 5, null, 2, -3, null, -2, 2, 1, null}, + nulls_at({2, 5, 8, 11, 15})}; + auto child2 = int64_col{2, 1, 12, 1, 2, 11, 4, 1, 10, 2, 3, 15, 5, 3, 4, 19}; + return structs_col{{child1, child2}}; + }(); + auto const input = cudf::slice(input_original, {0, 9})[0]; + + auto const expected = [] { + auto child1 = data_col{{null, -3, 0, 1, 2, 5}, null_at(0)}; + auto child2 = int64_col{33, 2, 4, 1, 3, 1}; + return structs_col{{child1, child2}}; + }(); + auto const result = histogram_reduction(input, agg); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *result); + } +} + template struct ReductionAnyAllTest : public ReductionTest {}; using AnyAllTypes = cudf::test::Types; From a97020f9c7e4e2be86788b5f7d83608839d3207b Mon Sep 17 00:00:00 2001 From: Robert Maynard Date: Wed, 27 Sep 2023 13:33:48 -0400 Subject: [PATCH 221/230] Correct numerous 20054-D: dynamic initialization errors found on arm+12.2 (#14108) Compile issues found by compiling libcudf with the `rapidsai/devcontainers:23.10-cpp-gcc9-cuda12.2-ubuntu20.04` docker container. 
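A minimal sketch of the pattern applied across these files (placeholder names, not code from the diff): members of the device-side state structs gain empty brace initializers, and structs that need it get an explicitly defaulted or empty `constexpr` constructor, so that their default construction stays constant-evaluable and no longer trips the 20054-D dynamic-initialization diagnostic.

    // Before (sketch): plain members, implicit default constructor.
    struct decode_state {
      int32_t error;
      uint32_t row_count;
    };

    // After (sketch): the pattern used throughout this change set.
    struct decode_state {
      constexpr decode_state() noexcept {}  // required to compile on ctk-12.2 + aarch64
      int32_t error{};
      uint32_t row_count{};
    };
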
Authors: - Robert Maynard (https://github.com/robertmaynard) Approvers: - Mark Harris (https://github.com/harrism) - David Wendt (https://github.com/davidwendt) - Bradley Dice (https://github.com/bdice) - Vyas Ramasubramani (https://github.com/vyasr) - Mike Wilson (https://github.com/hyperbolic2346) URL: https://github.com/rapidsai/cudf/pull/14108 --- cpp/src/io/avro/avro_common.hpp | 3 +- cpp/src/io/comp/unsnap.cu | 18 +++--- cpp/src/io/orc/orc_gpu.hpp | 39 +++++------- cpp/src/io/orc/stats_enc.cu | 10 +-- cpp/src/io/orc/stripe_init.cu | 29 ++++----- cpp/src/io/parquet/page_decode.cuh | 67 +++++++++++---------- cpp/src/io/parquet/page_hdr.cu | 12 ++-- cpp/src/io/parquet/parquet_gpu.hpp | 56 ++++++++--------- cpp/src/io/statistics/column_statistics.cuh | 12 ++-- cpp/src/io/statistics/statistics.cuh | 30 ++++----- 10 files changed, 138 insertions(+), 138 deletions(-) diff --git a/cpp/src/io/avro/avro_common.hpp b/cpp/src/io/avro/avro_common.hpp index ff8ee206dd4..0058d236d8c 100644 --- a/cpp/src/io/avro/avro_common.hpp +++ b/cpp/src/io/avro/avro_common.hpp @@ -25,7 +25,8 @@ namespace cudf { namespace io { namespace avro { struct block_desc_s { - block_desc_s() {} + block_desc_s() = default; // required to compile on ctk-12.2 + aarch64 + explicit constexpr block_desc_s( size_t offset_, uint32_t size_, uint32_t row_offset_, uint32_t first_row_, uint32_t num_rows_) : offset(offset_), diff --git a/cpp/src/io/comp/unsnap.cu b/cpp/src/io/comp/unsnap.cu index c699502317f..504a2fe377c 100644 --- a/cpp/src/io/comp/unsnap.cu +++ b/cpp/src/io/comp/unsnap.cu @@ -52,6 +52,8 @@ struct unsnap_batch_s { * @brief Queue structure used to exchange data between warps */ struct unsnap_queue_s { + unsnap_queue_s() = default; // required to compile on ctk-12.2 + aarch64 + uint32_t prefetch_wrpos; ///< Prefetcher write position uint32_t prefetch_rdpos; ///< Prefetch consumer read position int32_t prefetch_end; ///< Prefetch enable flag (nonzero stops prefetcher) @@ -64,13 +66,15 @@ struct unsnap_queue_s { * @brief snappy decompression state */ struct unsnap_state_s { - uint8_t const* base; ///< base ptr of compressed stream - uint8_t const* end; ///< end of compressed stream - uint32_t uncompressed_size; ///< uncompressed stream size - uint32_t bytes_left; ///< remaining bytes to decompress - int32_t error; ///< current error status - uint32_t tstart; ///< start time for perf logging - volatile unsnap_queue_s q; ///< queue for cross-warp communication + constexpr unsnap_state_s() noexcept {} // required to compile on ctk-12.2 + aarch64 + + uint8_t const* base{}; ///< base ptr of compressed stream + uint8_t const* end{}; ///< end of compressed stream + uint32_t uncompressed_size{}; ///< uncompressed stream size + uint32_t bytes_left{}; ///< remaining bytes to decompress + int32_t error{}; ///< current error status + uint32_t tstart{}; ///< start time for perf logging + volatile unsnap_queue_s q{}; ///< queue for cross-warp communication device_span src; ///< input for current block device_span dst; ///< output for current block }; diff --git a/cpp/src/io/orc/orc_gpu.hpp b/cpp/src/io/orc/orc_gpu.hpp index 9b8df50a22a..dba7a9ffda5 100644 --- a/cpp/src/io/orc/orc_gpu.hpp +++ b/cpp/src/io/orc/orc_gpu.hpp @@ -59,31 +59,24 @@ struct CompressedStreamInfo { explicit constexpr CompressedStreamInfo(uint8_t const* compressed_data_, size_t compressed_size_) : compressed_data(compressed_data_), uncompressed_data(nullptr), - compressed_data_size(compressed_size_), - dec_in_ctl(nullptr), - dec_out_ctl(nullptr), - 
copy_in_ctl(nullptr), - copy_out_ctl(nullptr), - num_compressed_blocks(0), - num_uncompressed_blocks(0), - max_uncompressed_size(0), - max_uncompressed_block_size(0) + compressed_data_size(compressed_size_) { } - uint8_t const* compressed_data; // [in] base ptr to compressed stream data - uint8_t* uncompressed_data; // [in] base ptr to uncompressed stream data or NULL if not known yet - size_t compressed_data_size; // [in] compressed data size for this stream - device_span* dec_in_ctl; // [in] input buffer to decompress - device_span* dec_out_ctl; // [in] output buffer to decompress into - device_span dec_res; // [in] results of decompression - device_span* copy_in_ctl; // [out] input buffer to copy - device_span* copy_out_ctl; // [out] output buffer to copy to - uint32_t num_compressed_blocks; // [in,out] number of entries in decctl(in), number of compressed - // blocks(out) - uint32_t num_uncompressed_blocks; // [in,out] number of entries in dec_in_ctl(in), number of - // uncompressed blocks(out) - uint64_t max_uncompressed_size; // [out] maximum uncompressed data size of stream - uint32_t max_uncompressed_block_size; // [out] maximum uncompressed size of any block in stream + uint8_t const* compressed_data{}; // [in] base ptr to compressed stream data + uint8_t* + uncompressed_data{}; // [in] base ptr to uncompressed stream data or NULL if not known yet + size_t compressed_data_size{}; // [in] compressed data size for this stream + device_span* dec_in_ctl{}; // [in] input buffer to decompress + device_span* dec_out_ctl{}; // [in] output buffer to decompress into + device_span dec_res{}; // [in] results of decompression + device_span* copy_in_ctl{}; // [out] input buffer to copy + device_span* copy_out_ctl{}; // [out] output buffer to copy to + uint32_t num_compressed_blocks{}; // [in,out] number of entries in decctl(in), number of + // compressed blocks(out) + uint32_t num_uncompressed_blocks{}; // [in,out] number of entries in dec_in_ctl(in), number of + // uncompressed blocks(out) + uint64_t max_uncompressed_size{}; // [out] maximum uncompressed data size of stream + uint32_t max_uncompressed_block_size{}; // [out] maximum uncompressed size of any block in stream }; enum StreamIndexType { diff --git a/cpp/src/io/orc/stats_enc.cu b/cpp/src/io/orc/stats_enc.cu index 69d7ec95acd..95f1db5bfd1 100644 --- a/cpp/src/io/orc/stats_enc.cu +++ b/cpp/src/io/orc/stats_enc.cu @@ -134,11 +134,11 @@ __global__ void __launch_bounds__(block_size, 1) } struct stats_state_s { - uint8_t* base; ///< Output buffer start - uint8_t* end; ///< Output buffer end - statistics_chunk chunk; - statistics_merge_group group; - statistics_dtype stats_dtype; //!< Statistics data type for this column + uint8_t* base{}; ///< Output buffer start + uint8_t* end{}; ///< Output buffer end + statistics_chunk chunk{}; + statistics_merge_group group{}; + statistics_dtype stats_dtype{}; //!< Statistics data type for this column }; /* diff --git a/cpp/src/io/orc/stripe_init.cu b/cpp/src/io/orc/stripe_init.cu index d8a60350356..8eeca504121 100644 --- a/cpp/src/io/orc/stripe_init.cu +++ b/cpp/src/io/orc/stripe_init.cu @@ -30,14 +30,14 @@ namespace orc { namespace gpu { struct comp_in_out { - uint8_t const* in_ptr; - size_t in_size; - uint8_t* out_ptr; - size_t out_size; + uint8_t const* in_ptr{}; + size_t in_size{}; + uint8_t* out_ptr{}; + size_t out_size{}; }; struct compressed_stream_s { - CompressedStreamInfo info; - comp_in_out ctl; + CompressedStreamInfo info{}; + comp_in_out ctl{}; }; // blockDim {128,1,1} @@ -208,14 +208,15 
@@ __global__ void __launch_bounds__(128, 8) * @brief Shared mem state for gpuParseRowGroupIndex */ struct rowindex_state_s { - ColumnDesc chunk; - uint32_t rowgroup_start; - uint32_t rowgroup_end; - int is_compressed; - uint32_t row_index_entry[3][CI_PRESENT]; // NOTE: Assumes CI_PRESENT follows CI_DATA and CI_DATA2 - CompressedStreamInfo strm_info[2]; - RowGroup rowgroups[128]; - uint32_t compressed_offset[128][2]; + ColumnDesc chunk{}; + uint32_t rowgroup_start{}; + uint32_t rowgroup_end{}; + int is_compressed{}; + uint32_t row_index_entry[3] + [CI_PRESENT]{}; // NOTE: Assumes CI_PRESENT follows CI_DATA and CI_DATA2 + CompressedStreamInfo strm_info[2]{}; + RowGroup rowgroups[128]{}; + uint32_t compressed_offset[128][2]{}; }; enum row_entry_state_e { diff --git a/cpp/src/io/parquet/page_decode.cuh b/cpp/src/io/parquet/page_decode.cuh index 26e3c951b2e..5e66885d746 100644 --- a/cpp/src/io/parquet/page_decode.cuh +++ b/cpp/src/io/parquet/page_decode.cuh @@ -26,48 +26,49 @@ namespace cudf::io::parquet::gpu { struct page_state_s { - uint8_t const* data_start; - uint8_t const* data_end; - uint8_t const* lvl_end; - uint8_t const* dict_base; // ptr to dictionary page data - int32_t dict_size; // size of dictionary data - int32_t first_row; // First row in page to output - int32_t num_rows; // Rows in page to decode (including rows to be skipped) - int32_t first_output_value; // First value in page to output - int32_t num_input_values; // total # of input/level values in the page - int32_t dtype_len; // Output data type length - int32_t dtype_len_in; // Can be larger than dtype_len if truncating 32-bit into 8-bit - int32_t dict_bits; // # of bits to store dictionary indices - uint32_t dict_run; - int32_t dict_val; - uint32_t initial_rle_run[NUM_LEVEL_TYPES]; // [def,rep] - int32_t initial_rle_value[NUM_LEVEL_TYPES]; // [def,rep] - int32_t error; - PageInfo page; - ColumnChunkDesc col; + constexpr page_state_s() noexcept {} + uint8_t const* data_start{}; + uint8_t const* data_end{}; + uint8_t const* lvl_end{}; + uint8_t const* dict_base{}; // ptr to dictionary page data + int32_t dict_size{}; // size of dictionary data + int32_t first_row{}; // First row in page to output + int32_t num_rows{}; // Rows in page to decode (including rows to be skipped) + int32_t first_output_value{}; // First value in page to output + int32_t num_input_values{}; // total # of input/level values in the page + int32_t dtype_len{}; // Output data type length + int32_t dtype_len_in{}; // Can be larger than dtype_len if truncating 32-bit into 8-bit + int32_t dict_bits{}; // # of bits to store dictionary indices + uint32_t dict_run{}; + int32_t dict_val{}; + uint32_t initial_rle_run[NUM_LEVEL_TYPES]{}; // [def,rep] + int32_t initial_rle_value[NUM_LEVEL_TYPES]{}; // [def,rep] + int32_t error{}; + PageInfo page{}; + ColumnChunkDesc col{}; // (leaf) value decoding - int32_t nz_count; // number of valid entries in nz_idx (write position in circular buffer) - int32_t dict_pos; // write position of dictionary indices - int32_t src_pos; // input read position of final output value - int32_t ts_scale; // timestamp scale: <0: divide by -ts_scale, >0: multiply by ts_scale + int32_t nz_count{}; // number of valid entries in nz_idx (write position in circular buffer) + int32_t dict_pos{}; // write position of dictionary indices + int32_t src_pos{}; // input read position of final output value + int32_t ts_scale{}; // timestamp scale: <0: divide by -ts_scale, >0: multiply by ts_scale // repetition/definition level decoding - int32_t 
input_value_count; // how many values of the input we've processed - int32_t input_row_count; // how many rows of the input we've processed - int32_t input_leaf_count; // how many leaf values of the input we've processed - uint8_t const* lvl_start[NUM_LEVEL_TYPES]; // [def,rep] - uint8_t const* abs_lvl_start[NUM_LEVEL_TYPES]; // [def,rep] - uint8_t const* abs_lvl_end[NUM_LEVEL_TYPES]; // [def,rep] - int32_t lvl_count[NUM_LEVEL_TYPES]; // how many of each of the streams we've decoded - int32_t row_index_lower_bound; // lower bound of row indices we should process + int32_t input_value_count{}; // how many values of the input we've processed + int32_t input_row_count{}; // how many rows of the input we've processed + int32_t input_leaf_count{}; // how many leaf values of the input we've processed + uint8_t const* lvl_start[NUM_LEVEL_TYPES]{}; // [def,rep] + uint8_t const* abs_lvl_start[NUM_LEVEL_TYPES]{}; // [def,rep] + uint8_t const* abs_lvl_end[NUM_LEVEL_TYPES]{}; // [def,rep] + int32_t lvl_count[NUM_LEVEL_TYPES]{}; // how many of each of the streams we've decoded + int32_t row_index_lower_bound{}; // lower bound of row indices we should process // a shared-memory cache of frequently used data when decoding. The source of this data is // normally stored in global memory which can yield poor performance. So, when possible // we copy that info here prior to decoding - PageNestingDecodeInfo nesting_decode_cache[max_cacheable_nesting_decode_info]; + PageNestingDecodeInfo nesting_decode_cache[max_cacheable_nesting_decode_info]{}; // points to either nesting_decode_cache above when possible, or to the global source otherwise - PageNestingDecodeInfo* nesting_info; + PageNestingDecodeInfo* nesting_info{}; }; // buffers only used in the decode kernel. separated from page_state_s to keep diff --git a/cpp/src/io/parquet/page_hdr.cu b/cpp/src/io/parquet/page_hdr.cu index 0d611643b46..6f8b2f50443 100644 --- a/cpp/src/io/parquet/page_hdr.cu +++ b/cpp/src/io/parquet/page_hdr.cu @@ -45,13 +45,13 @@ static const __device__ __constant__ uint8_t g_list2struct[16] = {0, ST_FLD_LIST}; struct byte_stream_s { - uint8_t const* cur; - uint8_t const* end; - uint8_t const* base; + uint8_t const* cur{}; + uint8_t const* end{}; + uint8_t const* base{}; // Parsed symbols - PageType page_type; - PageInfo page; - ColumnChunkDesc ck; + PageType page_type{}; + PageInfo page{}; + ColumnChunkDesc ck{}; }; /** diff --git a/cpp/src/io/parquet/parquet_gpu.hpp b/cpp/src/io/parquet/parquet_gpu.hpp index a3cc37dee4f..a760c2448dc 100644 --- a/cpp/src/io/parquet/parquet_gpu.hpp +++ b/cpp/src/io/parquet/parquet_gpu.hpp @@ -228,7 +228,7 @@ struct PageInfo { * @brief Struct describing a particular chunk of column data */ struct ColumnChunkDesc { - ColumnChunkDesc() = default; + constexpr ColumnChunkDesc() noexcept {}; explicit ColumnChunkDesc(size_t compressed_size_, uint8_t* compressed_data_, size_t num_values_, @@ -275,34 +275,34 @@ struct ColumnChunkDesc { { } - uint8_t const* compressed_data; // pointer to compressed column chunk data - size_t compressed_size; // total compressed data size for this chunk - size_t num_values; // total number of values in this column - size_t start_row; // starting row of this chunk - uint32_t num_rows; // number of rows in this chunk - int16_t max_level[level_type::NUM_LEVEL_TYPES]; // max definition/repetition level - int16_t max_nesting_depth; // max nesting depth of the output - uint16_t data_type; // basic column data type, ((type_length << 3) | - // parquet::Type) + uint8_t const* 
compressed_data{}; // pointer to compressed column chunk data + size_t compressed_size{}; // total compressed data size for this chunk + size_t num_values{}; // total number of values in this column + size_t start_row{}; // starting row of this chunk + uint32_t num_rows{}; // number of rows in this chunk + int16_t max_level[level_type::NUM_LEVEL_TYPES]{}; // max definition/repetition level + int16_t max_nesting_depth{}; // max nesting depth of the output + uint16_t data_type{}; // basic column data type, ((type_length << 3) | + // parquet::Type) uint8_t - level_bits[level_type::NUM_LEVEL_TYPES]; // bits to encode max definition/repetition levels - int32_t num_data_pages; // number of data pages - int32_t num_dict_pages; // number of dictionary pages - int32_t max_num_pages; // size of page_info array - PageInfo* page_info; // output page info for up to num_dict_pages + - // num_data_pages (dictionary pages first) - string_index_pair* str_dict_index; // index for string dictionary - bitmask_type** valid_map_base; // base pointers of valid bit map for this column - void** column_data_base; // base pointers of column data - void** column_string_base; // base pointers of column string data - int8_t codec; // compressed codec enum - int8_t converted_type; // converted type enum - LogicalType logical_type; // logical type - int8_t decimal_precision; // Decimal precision - int32_t ts_clock_rate; // output timestamp clock frequency (0=default, 1000=ms, 1000000000=ns) - - int32_t src_col_index; // my input column index - int32_t src_col_schema; // my schema index in the file + level_bits[level_type::NUM_LEVEL_TYPES]{}; // bits to encode max definition/repetition levels + int32_t num_data_pages{}; // number of data pages + int32_t num_dict_pages{}; // number of dictionary pages + int32_t max_num_pages{}; // size of page_info array + PageInfo* page_info{}; // output page info for up to num_dict_pages + + // num_data_pages (dictionary pages first) + string_index_pair* str_dict_index{}; // index for string dictionary + bitmask_type** valid_map_base{}; // base pointers of valid bit map for this column + void** column_data_base{}; // base pointers of column data + void** column_string_base{}; // base pointers of column string data + int8_t codec{}; // compressed codec enum + int8_t converted_type{}; // converted type enum + LogicalType logical_type{}; // logical type + int8_t decimal_precision{}; // Decimal precision + int32_t ts_clock_rate{}; // output timestamp clock frequency (0=default, 1000=ms, 1000000000=ns) + + int32_t src_col_index{}; // my input column index + int32_t src_col_schema{}; // my schema index in the file }; /** diff --git a/cpp/src/io/statistics/column_statistics.cuh b/cpp/src/io/statistics/column_statistics.cuh index 28e77f62a43..f71fb95949f 100644 --- a/cpp/src/io/statistics/column_statistics.cuh +++ b/cpp/src/io/statistics/column_statistics.cuh @@ -34,18 +34,18 @@ namespace io { * @brief shared state for statistics calculation kernel */ struct stats_state_s { - stats_column_desc col; ///< Column information - statistics_group group; ///< Group description - statistics_chunk ck; ///< Output statistics chunk + stats_column_desc col{}; ///< Column information + statistics_group group{}; ///< Group description + statistics_chunk ck{}; ///< Output statistics chunk }; /** * @brief shared state for statistics merge kernel */ struct merge_state_s { - stats_column_desc col; ///< Column information - statistics_merge_group group; ///< Group description - statistics_chunk ck; ///< Resulting 
statistics chunk + stats_column_desc col{}; ///< Column information + statistics_merge_group group{}; ///< Group description + statistics_chunk ck{}; ///< Resulting statistics chunk }; template diff --git a/cpp/src/io/statistics/statistics.cuh b/cpp/src/io/statistics/statistics.cuh index 805ca43553e..b6e698fee11 100644 --- a/cpp/src/io/statistics/statistics.cuh +++ b/cpp/src/io/statistics/statistics.cuh @@ -98,27 +98,27 @@ union statistics_val { }; struct statistics_chunk { - uint32_t non_nulls; //!< number of non-null values in chunk - uint32_t null_count; //!< number of null values in chunk - statistics_val min_value; //!< minimum value in chunk - statistics_val max_value; //!< maximum value in chunk - statistics_val sum; //!< sum of chunk - uint8_t has_minmax; //!< Nonzero if min_value and max_values are valid - uint8_t has_sum; //!< Nonzero if sum is valid + uint32_t non_nulls{}; //!< number of non-null values in chunk + uint32_t null_count{}; //!< number of null values in chunk + statistics_val min_value{}; //!< minimum value in chunk + statistics_val max_value{}; //!< maximum value in chunk + statistics_val sum{}; //!< sum of chunk + uint8_t has_minmax{}; //!< Nonzero if min_value and max_values are valid + uint8_t has_sum{}; //!< Nonzero if sum is valid }; struct statistics_group { - stats_column_desc const* col; //!< Column information - uint32_t start_row; //!< Start row of this group - uint32_t num_rows; //!< Number of rows in group - uint32_t non_leaf_nulls; //!< Number of null non-leaf values in the group + stats_column_desc const* col{}; //!< Column information + uint32_t start_row{}; //!< Start row of this group + uint32_t num_rows{}; //!< Number of rows in group + uint32_t non_leaf_nulls{}; //!< Number of null non-leaf values in the group }; struct statistics_merge_group { - data_type col_dtype; //!< Column data type - statistics_dtype stats_dtype; //!< Statistics data type for this column - uint32_t start_chunk; //!< Start chunk of this group - uint32_t num_chunks; //!< Number of chunks in group + data_type col_dtype; //!< Column data type + statistics_dtype stats_dtype{dtype_none}; //!< Statistics data type for this column + uint32_t start_chunk{}; //!< Start chunk of this group + uint32_t num_chunks{}; //!< Number of chunks in group }; template >* = nullptr> From bff0fcd721320210c53d3533e63fb34eac883f4e Mon Sep 17 00:00:00 2001 From: Raza Jafri Date: Wed, 27 Sep 2023 11:25:25 -0700 Subject: [PATCH 222/230] [Java] Add JNI bindings for `integers_to_hex` (#14205) This PR adds a method to ColumnView class to allow for conversion from Integers to hex closes #14081 Authors: - Raza Jafri (https://github.com/razajafri) Approvers: - Kuhu Shukla (https://github.com/kuhushukla) - Robert (Bobby) Evans (https://github.com/revans2) URL: https://github.com/rapidsai/cudf/pull/14205 --- .../main/java/ai/rapids/cudf/ColumnView.java | 27 +++++++++++++++++++ java/src/main/java/ai/rapids/cudf/DType.java | 19 +++++++++++++ java/src/main/native/src/ColumnViewJni.cpp | 9 +++++++ .../java/ai/rapids/cudf/ColumnVectorTest.java | 10 +++++++ 4 files changed, 65 insertions(+) diff --git a/java/src/main/java/ai/rapids/cudf/ColumnView.java b/java/src/main/java/ai/rapids/cudf/ColumnView.java index 3f3a55f0970..0b66701629b 100644 --- a/java/src/main/java/ai/rapids/cudf/ColumnView.java +++ b/java/src/main/java/ai/rapids/cudf/ColumnView.java @@ -4089,6 +4089,8 @@ static DeviceMemoryBufferView getOffsetsBuffer(long viewHandle) { private static native long isFixedPoint(long viewHandle, int nativeTypeId, int 
scale); + private static native long toHex(long viewHandle); + /** * Native method to concatenate a list column of strings (each row is a list of strings), * concatenates the strings within each row and returns a single strings column result. @@ -5231,4 +5233,29 @@ static ColumnView[] getColumnViewsFromPointers(long[] nativeHandles) { } } } + + /** + * Convert this integer column to hexadecimal column and return a new strings column + * + * Any null entries will result in corresponding null entries in the output column. + * + * The output character set is '0'-'9' and 'A'-'F'. The output string width will + * be a multiple of 2 depending on the size of the integer type. A single leading + * zero is applied to the first non-zero output byte if it is less than 0x10. + * + * Example: + * input = [123, -1, 0, 27, 342718233] + * s = input.toHex() + * s is [ '04D2', 'FFFFFFFF', '00', '1B', '146D7719'] + * + * The example above shows an `INT32` type column where each integer is 4 bytes. + * Leading zeros are suppressed unless filling out a complete byte as in + * `123 -> '04D2'` instead of `000004D2` or `4D2`. + * + * @return new string ColumnVector + */ + public ColumnVector toHex() { + assert getType().isIntegral() : "Only integers are supported"; + return new ColumnVector(toHex(this.getNativeView())); + } } diff --git a/java/src/main/java/ai/rapids/cudf/DType.java b/java/src/main/java/ai/rapids/cudf/DType.java index d0bb7761da4..07bc4fe3bbf 100644 --- a/java/src/main/java/ai/rapids/cudf/DType.java +++ b/java/src/main/java/ai/rapids/cudf/DType.java @@ -413,6 +413,14 @@ public boolean isDurationType() { } /** + * Returns true for strictly Integer types not a type backed by + * ints + */ + public boolean isIntegral() { + return INTEGRALS.contains(this.typeId); + } + + /** * Returns true for nested types */ public boolean isNestedType() { @@ -506,4 +514,15 @@ public boolean hasOffsets() { DTypeEnum.STRING, DTypeEnum.LIST ); + + private static final EnumSet INTEGRALS = EnumSet.of( + DTypeEnum.INT8, + DTypeEnum.INT16, + DTypeEnum.INT32, + DTypeEnum.INT64, + DTypeEnum.UINT8, + DTypeEnum.UINT16, + DTypeEnum.UINT32, + DTypeEnum.UINT64 + ); } diff --git a/java/src/main/native/src/ColumnViewJni.cpp b/java/src/main/native/src/ColumnViewJni.cpp index d5aad03645f..0ddaa2c15b5 100644 --- a/java/src/main/native/src/ColumnViewJni.cpp +++ b/java/src/main/native/src/ColumnViewJni.cpp @@ -2563,4 +2563,13 @@ Java_ai_rapids_cudf_ColumnView_purgeNonEmptyNulls(JNIEnv *env, jclass, jlong col CATCH_STD(env, 0); } +JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_toHex(JNIEnv *env, jclass, jlong input_ptr) { + JNI_NULL_CHECK(env, input_ptr, "input is null", 0); + try { + cudf::jni::auto_set_device(env); + const cudf::column_view *input = reinterpret_cast(input_ptr); + return release_as_jlong(cudf::strings::integers_to_hex(*input)); + } + CATCH_STD(env, 0); +} } // extern "C" diff --git a/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java b/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java index f6dffc88b92..9a0f8bda994 100644 --- a/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java +++ b/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java @@ -6876,4 +6876,14 @@ public void testUseAfterFree() { vector.close(); assertThrows(NullPointerException.class, vector::getDeviceMemorySize); } + + @Test + public void testConvertIntegerToHex() { + try ( + ColumnVector input = ColumnVector.fromInts(14, 2621, 50); + ColumnVector expected = ColumnVector.fromStrings("0E", "0A3D", "32"); + ColumnVector actual = 
input.toHex()) { + assertColumnsAreEqual(expected, actual); + } + } } From 66ac962dbeb69eade22b3bcaf186e3df2bae71b5 Mon Sep 17 00:00:00 2001 From: Nghia Truong <7416935+ttnghia@users.noreply.github.com> Date: Wed, 27 Sep 2023 12:20:20 -0700 Subject: [PATCH 223/230] JNI for `HISTOGRAM` and `MERGE_HISTOGRAM` aggregations (#14154) This implements JNI for `HISTOGRAM` and `MERGE_HISTOGRAM` aggregations in both groupby and reduction. Depends on: * https://github.com/rapidsai/cudf/pull/14045 Contributes to: * https://github.com/rapidsai/cudf/issues/13885. Authors: - Nghia Truong (https://github.com/ttnghia) Approvers: - Jason Lowe (https://github.com/jlowe) URL: https://github.com/rapidsai/cudf/pull/14154 --- .../main/java/ai/rapids/cudf/Aggregation.java | 26 ++++- .../ai/rapids/cudf/GroupByAggregation.java | 24 +++- .../ai/rapids/cudf/ReductionAggregation.java | 20 +++- java/src/main/native/src/AggregationJni.cpp | 7 +- .../test/java/ai/rapids/cudf/TableTest.java | 109 ++++++++++++++++++ 5 files changed, 181 insertions(+), 5 deletions(-) diff --git a/java/src/main/java/ai/rapids/cudf/Aggregation.java b/java/src/main/java/ai/rapids/cudf/Aggregation.java index d10329ca0f2..379750bb0b7 100644 --- a/java/src/main/java/ai/rapids/cudf/Aggregation.java +++ b/java/src/main/java/ai/rapids/cudf/Aggregation.java @@ -1,6 +1,6 @@ /* * - * Copyright (c) 2020-2022, NVIDIA CORPORATION. + * Copyright (c) 2020-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -68,7 +68,9 @@ enum Kind { DENSE_RANK(29), PERCENT_RANK(30), TDIGEST(31), // This can take a delta argument for accuracy level - MERGE_TDIGEST(32); // This can take a delta argument for accuracy level + MERGE_TDIGEST(32), // This can take a delta argument for accuracy level + HISTOGRAM(33), + MERGE_HISTOGRAM(34); final int nativeId; @@ -918,6 +920,26 @@ static TDigestAggregation mergeTDigest(int delta) { return new TDigestAggregation(Kind.MERGE_TDIGEST, delta); } + static final class HistogramAggregation extends NoParamAggregation { + private HistogramAggregation() { + super(Kind.HISTOGRAM); + } + } + + static final class MergeHistogramAggregation extends NoParamAggregation { + private MergeHistogramAggregation() { + super(Kind.MERGE_HISTOGRAM); + } + } + + static HistogramAggregation histogram() { + return new HistogramAggregation(); + } + + static MergeHistogramAggregation mergeHistogram() { + return new MergeHistogramAggregation(); + } + /** * Create one of the aggregations that only needs a kind, no other parameters. This does not * work for all types and for code safety reasons each kind is added separately. diff --git a/java/src/main/java/ai/rapids/cudf/GroupByAggregation.java b/java/src/main/java/ai/rapids/cudf/GroupByAggregation.java index 500d18f7eae..0fae33927b6 100644 --- a/java/src/main/java/ai/rapids/cudf/GroupByAggregation.java +++ b/java/src/main/java/ai/rapids/cudf/GroupByAggregation.java @@ -1,6 +1,6 @@ /* * - * Copyright (c) 2021, NVIDIA CORPORATION. + * Copyright (c) 2021-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -315,4 +315,26 @@ public static GroupByAggregation createTDigest(int delta) { public static GroupByAggregation mergeTDigest(int delta) { return new GroupByAggregation(Aggregation.mergeTDigest(delta)); } + + /** + * Histogram aggregation, computing the frequencies for each unique row. 
+ * + * A histogram is given as a lists column, in which the first child stores unique rows from + * the input values and the second child stores their corresponding frequencies. + * + * @return A lists of structs column in which each list contains a histogram corresponding to + * an input key. + */ + public static GroupByAggregation histogram() { + return new GroupByAggregation(Aggregation.histogram()); + } + + /** + * MergeHistogram aggregation, to merge multiple histograms. + * + * @return A new histogram in which the frequencies of the unique rows are sum up. + */ + public static GroupByAggregation mergeHistogram() { + return new GroupByAggregation(Aggregation.mergeHistogram()); + } } diff --git a/java/src/main/java/ai/rapids/cudf/ReductionAggregation.java b/java/src/main/java/ai/rapids/cudf/ReductionAggregation.java index eab1c94fd2c..ba8ae379bae 100644 --- a/java/src/main/java/ai/rapids/cudf/ReductionAggregation.java +++ b/java/src/main/java/ai/rapids/cudf/ReductionAggregation.java @@ -1,6 +1,6 @@ /* * - * Copyright (c) 2021-2022, NVIDIA CORPORATION. + * Copyright (c) 2021-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -286,4 +286,22 @@ public static ReductionAggregation mergeSets(NullEquality nullEquality, NaNEqual return new ReductionAggregation(Aggregation.mergeSets(nullEquality, nanEquality)); } + /** + * Create HistogramAggregation, computing the frequencies for each unique row. + * + * @return A structs column in which the first child stores unique rows from the input and the + * second child stores their corresponding frequencies. + */ + public static ReductionAggregation histogram() { + return new ReductionAggregation(Aggregation.histogram()); + } + + /** + * Create MergeHistogramAggregation, to merge multiple histograms. + * + * @return A new histogram in which the frequencies of the unique rows are sum up. + */ + public static ReductionAggregation mergeHistogram() { + return new ReductionAggregation(Aggregation.mergeHistogram()); + } } diff --git a/java/src/main/native/src/AggregationJni.cpp b/java/src/main/native/src/AggregationJni.cpp index 6ac73282615..bc62e95c36a 100644 --- a/java/src/main/native/src/AggregationJni.cpp +++ b/java/src/main/native/src/AggregationJni.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2022, NVIDIA CORPORATION. + * Copyright (c) 2020-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -90,6 +90,11 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Aggregation_createNoParamAgg(JNIEnv case 30: // ANSI SQL PERCENT_RANK return cudf::make_rank_aggregation(cudf::rank_method::MIN, {}, cudf::null_policy::INCLUDE, {}, cudf::rank_percentage::ONE_NORMALIZED); + case 33: // HISTOGRAM + return cudf::make_histogram_aggregation(); + case 34: // MERGE_HISTOGRAM + return cudf::make_merge_histogram_aggregation(); + default: throw std::logic_error("Unsupported No Parameter Aggregation Operation"); } }(); diff --git a/java/src/test/java/ai/rapids/cudf/TableTest.java b/java/src/test/java/ai/rapids/cudf/TableTest.java index 59f0d180c6e..faa73ac4322 100644 --- a/java/src/test/java/ai/rapids/cudf/TableTest.java +++ b/java/src/test/java/ai/rapids/cudf/TableTest.java @@ -4129,6 +4129,115 @@ void testMergeTDigestReduction() { } } + @Test + void testGroupbyHistogram() { + StructType histogramStruct = new StructType(false, + new BasicType(false, DType.INT32), // values + new BasicType(false, DType.INT64)); // frequencies + ListType histogramList = new ListType(false, histogramStruct); + + // key = 0: values = [2, 2, -3, -2, 2] + // key = 1: values = [2, 0, 5, 2, 1] + // key = 2: values = [-3, 1, 1, 2, 2] + try (Table input = new Table.TestBuilder() + .column(2, 0, 2, 1, 1, 1, 0, 0, 0, 1, 2, 2, 1, 0, 2) + .column(-3, 2, 1, 2, 0, 5, 2, -3, -2, 2, 1, 2, 1, 2, 2) + .build(); + Table result = input.groupBy(0) + .aggregate(GroupByAggregation.histogram().onColumn(1)); + Table sortedResult = result.orderBy(OrderByArg.asc(0)); + ColumnVector sortedOutHistograms = sortedResult.getColumn(1).listSortRows(false, false); + + ColumnVector expectedKeys = ColumnVector.fromInts(0, 1, 2); + ColumnVector expectedHistograms = ColumnVector.fromLists(histogramList, + Arrays.asList(new StructData(-3, 1L), new StructData(-2, 1L), new StructData(2, 3L)), + Arrays.asList(new StructData(0, 1L), new StructData(1, 1L), new StructData(2, 2L), + new StructData(5, 1L)), + Arrays.asList(new StructData(-3, 1L), new StructData(1, 2L), new StructData(2, 2L))) + ) { + assertColumnsAreEqual(expectedKeys, sortedResult.getColumn(0)); + assertColumnsAreEqual(expectedHistograms, sortedOutHistograms); + } + } + + @Test + void testGroupbyMergeHistogram() { + StructType histogramStruct = new StructType(false, + new BasicType(false, DType.INT32), // values + new BasicType(false, DType.INT64)); // frequencies + ListType histogramList = new ListType(false, histogramStruct); + + // key = 0: histograms = [[<-3, 1>, <-2, 1>, <2, 3>], [<0, 1>, <1, 1>], [<-3, 3>, <0, 1>, <1, 2>]] + // key = 1: histograms = [[<-2, 1>, <1, 3>, <2, 2>], [<0, 2>, <1, 1>, <2, 2>]] + try (Table input = new Table.TestBuilder() + .column(0, 1, 0, 1, 0) + .column(histogramStruct, + new StructData[]{new StructData(-3, 1L), new StructData(-2, 1L), new StructData(2, 3L)}, + new StructData[]{new StructData(-2, 1L), new StructData(1, 3L), new StructData(2, 2L)}, + new StructData[]{new StructData(0, 1L), new StructData(1, 1L)}, + new StructData[]{new StructData(0, 2L), new StructData(1, 1L), new StructData(2, 2L)}, + new StructData[]{new StructData(-3, 3L), new StructData(0, 1L), new StructData(1, 2L)}) + .build(); + Table result = input.groupBy(0) + .aggregate(GroupByAggregation.mergeHistogram().onColumn(1)); + Table sortedResult = result.orderBy(OrderByArg.asc(0)); + ColumnVector sortedOutHistograms = sortedResult.getColumn(1).listSortRows(false, false); + + ColumnVector expectedKeys = ColumnVector.fromInts(0, 1); + ColumnVector expectedHistograms = 
ColumnVector.fromLists(histogramList, + Arrays.asList(new StructData(-3, 4L), new StructData(-2, 1L), new StructData(0, 2L), + new StructData(1, 3L), new StructData(2, 3L)), + Arrays.asList(new StructData(-2, 1L), new StructData(0, 2L), new StructData(1, 4L), + new StructData(2, 4L))) + ) { + assertColumnsAreEqual(expectedKeys, sortedResult.getColumn(0)); + assertColumnsAreEqual(expectedHistograms, sortedOutHistograms); + } + } + + @Test + void testReductionHistogram() { + StructType histogramStruct = new StructType(false, + new BasicType(false, DType.INT32), // values + new BasicType(false, DType.INT64)); // frequencies + + try (ColumnVector input = ColumnVector.fromInts(-3, 2, 1, 2, 0, 5, 2, -3, -2, 2, 1); + Scalar result = input.reduce(ReductionAggregation.histogram(), DType.LIST); + ColumnVector resultCV = result.getListAsColumnView().copyToColumnVector(); + Table resultTable = new Table(resultCV); + Table sortedResult = resultTable.orderBy(OrderByArg.asc(0)); + + ColumnVector expectedHistograms = ColumnVector.fromStructs(histogramStruct, + new StructData(-3, 2L), new StructData(-2, 1L), new StructData(0, 1L), + new StructData(1, 2L), new StructData(2, 4L), new StructData(5, 1L)) + ) { + assertColumnsAreEqual(expectedHistograms, sortedResult.getColumn(0)); + } + } + + @Test + void testReductionMergeHistogram() { + StructType histogramStruct = new StructType(false, + new BasicType(false, DType.INT32), // values + new BasicType(false, DType.INT64)); // frequencies + + try (ColumnVector input = ColumnVector.fromStructs(histogramStruct, + new StructData(-3, 2L), new StructData(2, 1L), new StructData(1, 1L), + new StructData(2, 2L), new StructData(0, 4L), new StructData(5, 1L), + new StructData(2, 2L), new StructData(-3, 3L), new StructData(-2, 5L), + new StructData(2, 3L), new StructData(1, 4L)); + Scalar result = input.reduce(ReductionAggregation.mergeHistogram(), DType.LIST); + ColumnVector resultCV = result.getListAsColumnView().copyToColumnVector(); + Table resultTable = new Table(resultCV); + Table sortedResult = resultTable.orderBy(OrderByArg.asc(0)); + + ColumnVector expectedHistograms = ColumnVector.fromStructs(histogramStruct, + new StructData(-3, 5L), new StructData(-2, 5L), new StructData(0, 4L), + new StructData(1, 5L), new StructData(2, 8L), new StructData(5, 1L)) + ) { + assertColumnsAreEqual(expectedHistograms, sortedResult.getColumn(0)); + } + } @Test void testGroupByMinMaxDecimal() { try (Table t1 = new Table.TestBuilder() From b789d4ce3c090a3f25a8657d9a8582a1edb54f12 Mon Sep 17 00:00:00 2001 From: GALI PREM SAGAR Date: Wed, 27 Sep 2023 12:20:46 -0700 Subject: [PATCH 224/230] Preserve name of the column while initializing a `DataFrame` (#14110) Fixes: #14088 This PR preserves `names` of `column` object while constructing a `DataFrame` through various constructor flows. 
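As a quick illustration of the intended behavior, here is a minimal sketch (the index name and data below are invented for the example, not taken from the tests):

```
import pandas as pd
import cudf

# Pass a named pandas Index as `columns` when constructing the DataFrame.
columns = pd.Index(["a"], name="rapids")
gdf = cudf.DataFrame({"a": [10, 11, 12]}, columns=columns)

# With this change the index name survives construction.
assert gdf.columns.name == "rapids"
```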
Authors: - GALI PREM SAGAR (https://github.com/galipremsagar) Approvers: - Bradley Dice (https://github.com/bdice) - Ashwin Srinath (https://github.com/shwina) URL: https://github.com/rapidsai/cudf/pull/14110 --- python/cudf/cudf/core/column_accessor.py | 2 -- python/cudf/cudf/core/dataframe.py | 26 ++++++++++++++++++--- python/cudf/cudf/core/indexed_frame.py | 4 +++- python/cudf/cudf/tests/test_dataframe.py | 29 ++++++++++++++++++++---- 4 files changed, 51 insertions(+), 10 deletions(-) diff --git a/python/cudf/cudf/core/column_accessor.py b/python/cudf/cudf/core/column_accessor.py index bec9c367ba9..cb79a30422e 100644 --- a/python/cudf/cudf/core/column_accessor.py +++ b/python/cudf/cudf/core/column_accessor.py @@ -197,8 +197,6 @@ def nlevels(self) -> int: @property def name(self) -> Any: - if len(self._data) == 0: - return None return self.level_names[-1] @property diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index 8a3dbe77787..ead2f182e2d 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -665,7 +665,10 @@ def __init__( len(self), dtype="object", masked=True ) for k in columns - } + }, + level_names=tuple(columns.names) + if isinstance(columns, pd.Index) + else None, ) elif isinstance(data, ColumnAccessor): raise TypeError( @@ -712,6 +715,11 @@ def __init__( self._data = new_df._data self._index = new_df._index + self._data._level_names = ( + tuple(columns.names) + if isinstance(columns, pd.Index) + else self._data._level_names + ) elif len(data) > 0 and isinstance(data[0], Series): self._init_from_series_list( data=data, columns=columns, index=index @@ -834,6 +842,11 @@ def _init_from_series_list(self, data, columns, index): self._data[col_name] = column.column_empty( row_count=len(self), dtype=None, masked=True ) + self._data._level_names = ( + tuple(columns.names) + if isinstance(columns, pd.Index) + else self._data._level_names + ) self._data = self._data.select_by_label(columns) @_cudf_nvtx_annotate @@ -957,6 +970,11 @@ def _init_from_dict_like( data[col_name], nan_as_null=nan_as_null, ) + self._data._level_names = ( + tuple(columns.names) + if isinstance(columns, pd.Index) + else self._data._level_names + ) @classmethod def _from_data( @@ -5131,7 +5149,7 @@ def from_pandas(cls, dataframe, nan_as_null=None): index = cudf.from_pandas(dataframe.index, nan_as_null=nan_as_null) df = cls._from_data(data, index) - df._data._level_names = list(dataframe.columns.names) + df._data._level_names = tuple(dataframe.columns.names) # Set columns only if it is a MultiIndex if isinstance(dataframe.columns, pd.MultiIndex): @@ -5377,6 +5395,8 @@ def from_records(cls, data, index=None, columns=None, nan_as_null=False): df = df.set_index(index) else: df._index = as_index(index) + if isinstance(columns, pd.Index): + df._data._level_names = tuple(columns.names) return df @classmethod @@ -5434,7 +5454,7 @@ def _from_arrays(cls, data, index=None, columns=None, nan_as_null=False): data, nan_as_null=nan_as_null ) if isinstance(columns, pd.Index): - df._data._level_names = list(columns.names) + df._data._level_names = tuple(columns.names) if index is None: df._index = RangeIndex(start=0, stop=len(data)) diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py index aacf1fa8dae..1008cbdb67f 100644 --- a/python/cudf/cudf/core/indexed_frame.py +++ b/python/cudf/cudf/core/indexed_frame.py @@ -2661,7 +2661,9 @@ def _reindex( data=cudf.core.column_accessor.ColumnAccessor( cols, 
multiindex=self._data.multiindex, - level_names=self._data.level_names, + level_names=tuple(column_names.names) + if isinstance(column_names, pd.Index) + else None, ), index=index, ) diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py index 67b63028fab..c297748f7e5 100644 --- a/python/cudf/cudf/tests/test_dataframe.py +++ b/python/cudf/cudf/tests/test_dataframe.py @@ -6394,6 +6394,7 @@ def test_df_series_dataframe_astype_dtype_dict(copy): ([range(100), range(100)], ["range" + str(i) for i in range(100)]), (((1, 2, 3), (1, 2, 3)), ["tuple0", "tuple1", "tuple2"]), ([[1, 2, 3]], ["list col1", "list col2", "list col3"]), + ([[1, 2, 3]], pd.Index(["col1", "col2", "col3"], name="rapids")), ([range(100)], ["range" + str(i) for i in range(100)]), (((1, 2, 3),), ["k1", "k2", "k3"]), ], @@ -7969,6 +7970,7 @@ def test_series_empty(ps): @pytest.mark.parametrize( "data", [ + None, [], [1], {"a": [10, 11, 12]}, @@ -7979,7 +7981,10 @@ def test_series_empty(ps): }, ], ) -@pytest.mark.parametrize("columns", [["a"], ["another column name"], None]) +@pytest.mark.parametrize( + "columns", + [["a"], ["another column name"], None, pd.Index(["a"], name="index name")], +) def test_dataframe_init_with_columns(data, columns): pdf = pd.DataFrame(data, columns=columns) gdf = cudf.DataFrame(data, columns=columns) @@ -8047,7 +8052,16 @@ def test_dataframe_init_with_columns(data, columns): ], ) @pytest.mark.parametrize( - "columns", [None, ["0"], [0], ["abc"], [144, 13], [2, 1, 0]] + "columns", + [ + None, + ["0"], + [0], + ["abc"], + [144, 13], + [2, 1, 0], + pd.Index(["abc"], name="custom_name"), + ], ) def test_dataframe_init_from_series_list(data, ignore_dtype, columns): gd_data = [cudf.from_pandas(obj) for obj in data] @@ -10239,14 +10253,21 @@ def test_dataframe_binop_with_datetime_index(): @pytest.mark.parametrize( - "columns", ([], ["c", "a"], ["a", "d", "b", "e", "c"], ["a", "b", "c"]) + "columns", + ( + [], + ["c", "a"], + ["a", "d", "b", "e", "c"], + ["a", "b", "c"], + pd.Index(["b", "a", "c"], name="custom_name"), + ), ) @pytest.mark.parametrize("index", (None, [4, 5, 6])) def test_dataframe_dict_like_with_columns(columns, index): data = {"a": [1, 2, 3], "b": [4, 5, 6], "c": [7, 8, 9]} expect = pd.DataFrame(data, columns=columns, index=index) actual = cudf.DataFrame(data, columns=columns, index=index) - if index is None and columns == []: + if index is None and len(columns) == 0: # We make an empty range index, pandas makes an empty index expect = expect.reset_index(drop=True) assert_eq(expect, actual) From 2c19bf328ffefb97d17e5ae600197a4ea9ca4445 Mon Sep 17 00:00:00 2001 From: Vukasin Milovanovic Date: Wed, 27 Sep 2023 20:37:04 -0700 Subject: [PATCH 225/230] Propagate errors from Parquet reader kernels back to host (#14167) Pass the error code to the host when a kernel detects invalid input. If multiple errors types are detected, they are combined using a bitwise OR so that caller gets the aggregate error code that includes all types of errors that occurred. Does not change the kernel side checks. 
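As a minimal sketch of the scheme described above (simplified; only two of the error bits are shown and the helper name is made up for illustration):

```
// Each error type is a single bit; kernels OR their bit into one device-side
// word so the host sees an aggregate error code after the kernels complete.
#include <cuda/atomic>
#include <cstdint>

enum class decode_error : int32_t {
  DATA_STREAM_OVERRUN  = 0x1,
  UNSUPPORTED_ENCODING = 0x4,
};

// Hypothetical device-side helper a kernel would call when it hits bad input.
__device__ inline void record_error(decode_error err, int32_t* error_code)
{
  cuda::atomic_ref<int32_t, cuda::thread_scope_device> ref{*error_code};
  ref.fetch_or(static_cast<int32_t>(err), cuda::std::memory_order_relaxed);
}
// On the host, the aggregate value is read back once after the decode kernels
// finish, and a nonzero value is reported (in hex) as a failure.
```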
Authors: - Vukasin Milovanovic (https://github.com/vuule) Approvers: - https://github.com/nvdbaranec - Divye Gala (https://github.com/divyegala) - Yunsong Wang (https://github.com/PointKernel) - Bradley Dice (https://github.com/bdice) URL: https://github.com/rapidsai/cudf/pull/14167 --- cpp/src/io/parquet/page_data.cu | 25 ++++++++--- cpp/src/io/parquet/page_decode.cuh | 57 +++++++++++++++++------- cpp/src/io/parquet/page_delta_decode.cu | 25 ++++++++--- cpp/src/io/parquet/page_string_decode.cu | 25 ++++++++--- cpp/src/io/parquet/parquet_gpu.hpp | 21 +++++++++ cpp/src/io/parquet/reader_impl.cpp | 19 ++++++-- 6 files changed, 130 insertions(+), 42 deletions(-) diff --git a/cpp/src/io/parquet/page_data.cu b/cpp/src/io/parquet/page_data.cu index c26802aa3c2..230834632dd 100644 --- a/cpp/src/io/parquet/page_data.cu +++ b/cpp/src/io/parquet/page_data.cu @@ -430,10 +430,15 @@ static __device__ void gpuOutputGeneric( * @param chunks List of column chunks * @param min_row Row index to start reading at * @param num_rows Maximum number of rows to read + * @param error_code Error code to set if an error is encountered */ template -__global__ void __launch_bounds__(decode_block_size) gpuDecodePageData( - PageInfo* pages, device_span chunks, size_t min_row, size_t num_rows) +__global__ void __launch_bounds__(decode_block_size) + gpuDecodePageData(PageInfo* pages, + device_span chunks, + size_t min_row, + size_t num_rows, + int32_t* error_code) { __shared__ __align__(16) page_state_s state_g; __shared__ __align__(16) @@ -472,7 +477,8 @@ __global__ void __launch_bounds__(decode_block_size) gpuDecodePageData( // skipped_leaf_values will always be 0 for flat hierarchies. uint32_t skipped_leaf_values = s->page.skipped_leaf_values; - while (!s->error && (s->input_value_count < s->num_input_values || s->src_pos < s->nz_count)) { + while (s->error == 0 && + (s->input_value_count < s->num_input_values || s->src_pos < s->nz_count)) { int target_pos; int src_pos = s->src_pos; @@ -596,6 +602,10 @@ __global__ void __launch_bounds__(decode_block_size) gpuDecodePageData( } __syncthreads(); } + if (t == 0 and s->error != 0) { + cuda::atomic_ref ref{*error_code}; + ref.fetch_or(s->error, cuda::std::memory_order_relaxed); + } } struct mask_tform { @@ -621,6 +631,7 @@ void __host__ DecodePageData(cudf::detail::hostdevice_vector& pages, size_t num_rows, size_t min_row, int level_type_size, + int32_t* error_code, rmm::cuda_stream_view stream) { CUDF_EXPECTS(pages.size() > 0, "There is no page to decode"); @@ -629,11 +640,11 @@ void __host__ DecodePageData(cudf::detail::hostdevice_vector& pages, dim3 dim_grid(pages.size(), 1); // 1 threadblock per page if (level_type_size == 1) { - gpuDecodePageData - <<>>(pages.device_ptr(), chunks, min_row, num_rows); + gpuDecodePageData<<>>( + pages.device_ptr(), chunks, min_row, num_rows, error_code); } else { - gpuDecodePageData - <<>>(pages.device_ptr(), chunks, min_row, num_rows); + gpuDecodePageData<<>>( + pages.device_ptr(), chunks, min_row, num_rows, error_code); } } diff --git a/cpp/src/io/parquet/page_decode.cuh b/cpp/src/io/parquet/page_decode.cuh index 5e66885d746..cdc29197eb3 100644 --- a/cpp/src/io/parquet/page_decode.cuh +++ b/cpp/src/io/parquet/page_decode.cuh @@ -21,6 +21,7 @@ #include +#include #include namespace cudf::io::parquet::gpu { @@ -69,6 +70,18 @@ struct page_state_s { PageNestingDecodeInfo nesting_decode_cache[max_cacheable_nesting_decode_info]{}; // points to either nesting_decode_cache above when possible, or to the global source otherwise 
PageNestingDecodeInfo* nesting_info{}; + + inline __device__ void set_error_code(decode_error err) volatile + { + cuda::atomic_ref ref{const_cast(error)}; + ref.fetch_or(static_cast(err), cuda::std::memory_order_relaxed); + } + + inline __device__ void reset_error_code() volatile + { + cuda::atomic_ref ref{const_cast(error)}; + ref.store(0, cuda::std::memory_order_release); + } }; // buffers only used in the decode kernel. separated from page_state_s to keep @@ -471,7 +484,7 @@ __device__ void gpuDecodeStream( int32_t value_count = s->lvl_count[lvl]; int32_t batch_coded_count = 0; - while (value_count < target_count && value_count < num_input_values) { + while (s->error == 0 && value_count < target_count && value_count < num_input_values) { int batch_len; if (level_run <= 1) { // Get a new run symbol from the byte stream @@ -487,7 +500,14 @@ __device__ void gpuDecodeStream( cur++; } } - if (cur > end || level_run <= 1) { s->error = 0x10; } + if (cur > end) { + s->set_error_code(decode_error::LEVEL_STREAM_OVERRUN); + break; + } + if (level_run <= 1) { + s->set_error_code(decode_error::INVALID_LEVEL_RUN); + break; + } sym_len = (int32_t)(cur - cur_def); __threadfence_block(); } @@ -496,7 +516,7 @@ __device__ void gpuDecodeStream( level_run = shuffle(level_run); cur_def += sym_len; } - if (s->error) { break; } + if (s->error != 0) { break; } batch_len = min(num_input_values - value_count, 32); if (level_run & 1) { @@ -852,7 +872,7 @@ __device__ void gpuDecodeLevels(page_state_s* s, constexpr int batch_size = 32; int cur_leaf_count = target_leaf_count; - while (!s->error && s->nz_count < target_leaf_count && + while (s->error == 0 && s->nz_count < target_leaf_count && s->input_value_count < s->num_input_values) { if (has_repetition) { gpuDecodeStream(rep, s, cur_leaf_count, t, level_type::REPETITION); @@ -916,7 +936,7 @@ inline __device__ uint32_t InitLevelSection(page_state_s* s, } s->lvl_start[lvl] = cur; - if (cur > end) { s->error = 2; } + if (cur > end) { s->set_error_code(decode_error::LEVEL_STREAM_OVERRUN); } }; // this is a little redundant. if level_bits == 0, then nothing should be encoded @@ -941,8 +961,8 @@ inline __device__ uint32_t InitLevelSection(page_state_s* s, // add back the 4 bytes for the length len += 4; } else { - len = 0; - s->error = 2; + len = 0; + s->set_error_code(decode_error::LEVEL_STREAM_OVERRUN); } } else if (encoding == Encoding::BIT_PACKED) { len = (s->page.num_input_values * level_bits + 7) >> 3; @@ -951,8 +971,8 @@ inline __device__ uint32_t InitLevelSection(page_state_s* s, s->lvl_start[lvl] = cur; s->abs_lvl_start[lvl] = cur; } else { - s->error = 3; - len = 0; + len = 0; + s->set_error_code(decode_error::UNSUPPORTED_ENCODING); } s->abs_lvl_end[lvl] = start + len; @@ -1094,7 +1114,7 @@ inline __device__ bool setupLocalPageInfo(page_state_s* const s, } if (!t) { - s->error = 0; + s->reset_error_code(); // IMPORTANT : nested schemas can have 0 rows in a page but still have // values. The case is: @@ -1152,7 +1172,7 @@ inline __device__ bool setupLocalPageInfo(page_state_s* const s, break; default: // FIXED_LEN_BYTE_ARRAY: s->dtype_len = dtype_len_out; - s->error |= (s->dtype_len <= 0); + if (s->dtype_len <= 0) { s->set_error_code(decode_error::INVALID_DATA_TYPE); } break; } // Special check for downconversions @@ -1268,7 +1288,9 @@ inline __device__ bool setupLocalPageInfo(page_state_s* const s, s->dict_run = 0; s->dict_val = 0; s->dict_bits = (cur < end) ? 
*cur++ : 0; - if (s->dict_bits > 32 || !s->dict_base) { s->error = (10 << 8) | s->dict_bits; } + if (s->dict_bits > 32 || !s->dict_base) { + s->set_error_code(decode_error::INVALID_DICT_WIDTH); + } break; case Encoding::PLAIN: s->dict_size = static_cast(end - cur); @@ -1279,22 +1301,23 @@ inline __device__ bool setupLocalPageInfo(page_state_s* const s, // first 4 bytes are length of RLE data int const len = (cur[0]) + (cur[1] << 8) + (cur[2] << 16) + (cur[3] << 24); cur += 4; - if (cur + len > end) { s->error = 2; } + if (cur + len > end) { s->set_error_code(decode_error::DATA_STREAM_OVERRUN); } s->dict_run = 0; } break; case Encoding::DELTA_BINARY_PACKED: // nothing to do, just don't error break; - default: - s->error = 1; // Unsupported encoding + default: { + s->set_error_code(decode_error::UNSUPPORTED_ENCODING); break; + } } - if (cur > end) { s->error = 1; } + if (cur > end) { s->set_error_code(decode_error::DATA_STREAM_OVERRUN); } s->lvl_end = cur; s->data_start = cur; s->data_end = end; } else { - s->error = 1; + s->set_error_code(decode_error::EMPTY_PAGE); } s->lvl_count[level_type::REPETITION] = 0; diff --git a/cpp/src/io/parquet/page_delta_decode.cu b/cpp/src/io/parquet/page_delta_decode.cu index 35f33a761be..2b78dead205 100644 --- a/cpp/src/io/parquet/page_delta_decode.cu +++ b/cpp/src/io/parquet/page_delta_decode.cu @@ -32,8 +32,12 @@ namespace { // with V2 page headers; see https://www.mail-archive.com/dev@parquet.apache.org/msg11826.html). // this kernel only needs 96 threads (3 warps)(for now). template -__global__ void __launch_bounds__(96) gpuDecodeDeltaBinary( - PageInfo* pages, device_span chunks, size_t min_row, size_t num_rows) +__global__ void __launch_bounds__(96) + gpuDecodeDeltaBinary(PageInfo* pages, + device_span chunks, + size_t min_row, + size_t num_rows, + int32_t* error_code) { using cudf::detail::warp_size; __shared__ __align__(16) delta_binary_decoder db_state; @@ -79,7 +83,8 @@ __global__ void __launch_bounds__(96) gpuDecodeDeltaBinary( // that has a value we need. 
if (skipped_leaf_values > 0) { db->skip_values(skipped_leaf_values); } - while (!s->error && (s->input_value_count < s->num_input_values || s->src_pos < s->nz_count)) { + while (s->error == 0 && + (s->input_value_count < s->num_input_values || s->src_pos < s->nz_count)) { uint32_t target_pos; uint32_t const src_pos = s->src_pos; @@ -145,6 +150,11 @@ __global__ void __launch_bounds__(96) gpuDecodeDeltaBinary( } __syncthreads(); } + + if (t == 0 and s->error != 0) { + cuda::atomic_ref ref{*error_code}; + ref.fetch_or(s->error, cuda::std::memory_order_relaxed); + } } } // anonymous namespace @@ -157,6 +167,7 @@ void __host__ DecodeDeltaBinary(cudf::detail::hostdevice_vector& pages size_t num_rows, size_t min_row, int level_type_size, + int32_t* error_code, rmm::cuda_stream_view stream) { CUDF_EXPECTS(pages.size() > 0, "There is no page to decode"); @@ -165,11 +176,11 @@ void __host__ DecodeDeltaBinary(cudf::detail::hostdevice_vector& pages dim3 dim_grid(pages.size(), 1); // 1 threadblock per page if (level_type_size == 1) { - gpuDecodeDeltaBinary - <<>>(pages.device_ptr(), chunks, min_row, num_rows); + gpuDecodeDeltaBinary<<>>( + pages.device_ptr(), chunks, min_row, num_rows, error_code); } else { - gpuDecodeDeltaBinary - <<>>(pages.device_ptr(), chunks, min_row, num_rows); + gpuDecodeDeltaBinary<<>>( + pages.device_ptr(), chunks, min_row, num_rows, error_code); } } diff --git a/cpp/src/io/parquet/page_string_decode.cu b/cpp/src/io/parquet/page_string_decode.cu index 1ac4c95f713..d79abe4a6d2 100644 --- a/cpp/src/io/parquet/page_string_decode.cu +++ b/cpp/src/io/parquet/page_string_decode.cu @@ -582,8 +582,12 @@ __global__ void __launch_bounds__(preprocess_block_size) gpuComputePageStringSiz * @tparam level_t Type used to store decoded repetition and definition levels */ template -__global__ void __launch_bounds__(decode_block_size) gpuDecodeStringPageData( - PageInfo* pages, device_span chunks, size_t min_row, size_t num_rows) +__global__ void __launch_bounds__(decode_block_size) + gpuDecodeStringPageData(PageInfo* pages, + device_span chunks, + size_t min_row, + size_t num_rows, + int32_t* error_code) { __shared__ __align__(16) page_state_s state_g; __shared__ __align__(4) size_type last_offset; @@ -617,7 +621,8 @@ __global__ void __launch_bounds__(decode_block_size) gpuDecodeStringPageData( // skipped_leaf_values will always be 0 for flat hierarchies. 
uint32_t skipped_leaf_values = s->page.skipped_leaf_values; - while (!s->error && (s->input_value_count < s->num_input_values || s->src_pos < s->nz_count)) { + while (s->error == 0 && + (s->input_value_count < s->num_input_values || s->src_pos < s->nz_count)) { int target_pos; int src_pos = s->src_pos; @@ -742,6 +747,11 @@ __global__ void __launch_bounds__(decode_block_size) gpuDecodeStringPageData( auto const offptr = reinterpret_cast(nesting_info_base[leaf_level_index].data_out); block_excl_sum(offptr, value_count, s->page.str_offset); + + if (t == 0 and s->error != 0) { + cuda::atomic_ref ref{*error_code}; + ref.fetch_or(s->error, cuda::std::memory_order_relaxed); + } } } // anonymous namespace @@ -775,6 +785,7 @@ void __host__ DecodeStringPageData(cudf::detail::hostdevice_vector& pa size_t num_rows, size_t min_row, int level_type_size, + int32_t* error_code, rmm::cuda_stream_view stream) { CUDF_EXPECTS(pages.size() > 0, "There is no page to decode"); @@ -783,11 +794,11 @@ void __host__ DecodeStringPageData(cudf::detail::hostdevice_vector& pa dim3 dim_grid(pages.size(), 1); // 1 threadblock per page if (level_type_size == 1) { - gpuDecodeStringPageData - <<>>(pages.device_ptr(), chunks, min_row, num_rows); + gpuDecodeStringPageData<<>>( + pages.device_ptr(), chunks, min_row, num_rows, error_code); } else { - gpuDecodeStringPageData - <<>>(pages.device_ptr(), chunks, min_row, num_rows); + gpuDecodeStringPageData<<>>( + pages.device_ptr(), chunks, min_row, num_rows, error_code); } } diff --git a/cpp/src/io/parquet/parquet_gpu.hpp b/cpp/src/io/parquet/parquet_gpu.hpp index a760c2448dc..3c37c0df021 100644 --- a/cpp/src/io/parquet/parquet_gpu.hpp +++ b/cpp/src/io/parquet/parquet_gpu.hpp @@ -54,6 +54,21 @@ constexpr int rolling_index(int index) return index % rolling_size; } +/** + * @brief Enum for the different types of errors that can occur during decoding. + * + * These values are used as bitmasks, so they must be powers of 2. + */ +enum class decode_error : int32_t { + DATA_STREAM_OVERRUN = 0x1, + LEVEL_STREAM_OVERRUN = 0x2, + UNSUPPORTED_ENCODING = 0x4, + INVALID_LEVEL_RUN = 0x8, + INVALID_DATA_TYPE = 0x10, + EMPTY_PAGE = 0x20, + INVALID_DICT_WIDTH = 0x40, +}; + /** * @brief Struct representing an input column in the file. 
*/ @@ -566,6 +581,7 @@ void ComputePageStringSizes(cudf::detail::hostdevice_vector& pages, * @param[in] num_rows Total number of rows to read * @param[in] min_row Minimum number of rows to read * @param[in] level_type_size Size in bytes of the type for level decoding + * @param[out] error_code Error code for kernel failures * @param[in] stream CUDA stream to use */ void DecodePageData(cudf::detail::hostdevice_vector& pages, @@ -573,6 +589,7 @@ void DecodePageData(cudf::detail::hostdevice_vector& pages, size_t num_rows, size_t min_row, int level_type_size, + int32_t* error_code, rmm::cuda_stream_view stream); /** @@ -586,6 +603,7 @@ void DecodePageData(cudf::detail::hostdevice_vector& pages, * @param[in] num_rows Total number of rows to read * @param[in] min_row Minimum number of rows to read * @param[in] level_type_size Size in bytes of the type for level decoding + * @param[out] error_code Error code for kernel failures * @param[in] stream CUDA stream to use */ void DecodeStringPageData(cudf::detail::hostdevice_vector& pages, @@ -593,6 +611,7 @@ void DecodeStringPageData(cudf::detail::hostdevice_vector& pages, size_t num_rows, size_t min_row, int level_type_size, + int32_t* error_code, rmm::cuda_stream_view stream); /** @@ -606,6 +625,7 @@ void DecodeStringPageData(cudf::detail::hostdevice_vector& pages, * @param[in] num_rows Total number of rows to read * @param[in] min_row Minimum number of rows to read * @param[in] level_type_size Size in bytes of the type for level decoding + * @param[out] error_code Error code for kernel failures * @param[in] stream CUDA stream to use, default 0 */ void DecodeDeltaBinary(cudf::detail::hostdevice_vector& pages, @@ -613,6 +633,7 @@ void DecodeDeltaBinary(cudf::detail::hostdevice_vector& pages, size_t num_rows, size_t min_row, int level_type_size, + int32_t* error_code, rmm::cuda_stream_view stream); /** diff --git a/cpp/src/io/parquet/reader_impl.cpp b/cpp/src/io/parquet/reader_impl.cpp index 8b0a0bd4eb0..6cbe64e227b 100644 --- a/cpp/src/io/parquet/reader_impl.cpp +++ b/cpp/src/io/parquet/reader_impl.cpp @@ -163,6 +163,8 @@ void reader::impl::decode_page_data(size_t skip_rows, size_t num_rows) chunk_nested_valids.host_to_device_async(_stream); chunk_nested_data.host_to_device_async(_stream); + rmm::device_scalar error_code(0, _stream); + // get the number of streams we need from the pool and tell them to wait on the H2D copies int const nkernels = std::bitset<32>(kernel_mask).count(); auto streams = cudf::detail::fork_streams(_stream, nkernels); @@ -174,17 +176,20 @@ void reader::impl::decode_page_data(size_t skip_rows, size_t num_rows) if (has_strings) { auto& stream = streams[s_idx++]; chunk_nested_str_data.host_to_device_async(stream); - gpu::DecodeStringPageData(pages, chunks, num_rows, skip_rows, level_type_size, stream); + gpu::DecodeStringPageData( + pages, chunks, num_rows, skip_rows, level_type_size, error_code.data(), stream); } // launch delta binary decoder if ((kernel_mask & gpu::KERNEL_MASK_DELTA_BINARY) != 0) { - gpu::DecodeDeltaBinary(pages, chunks, num_rows, skip_rows, level_type_size, streams[s_idx++]); + gpu::DecodeDeltaBinary( + pages, chunks, num_rows, skip_rows, level_type_size, error_code.data(), streams[s_idx++]); } // launch the catch-all page decoder if ((kernel_mask & gpu::KERNEL_MASK_GENERAL) != 0) { - gpu::DecodePageData(pages, chunks, num_rows, skip_rows, level_type_size, streams[s_idx++]); + gpu::DecodePageData( + pages, chunks, num_rows, skip_rows, level_type_size, error_code.data(), streams[s_idx++]); } // synchronize 
the streams @@ -193,7 +198,13 @@ void reader::impl::decode_page_data(size_t skip_rows, size_t num_rows) pages.device_to_host_async(_stream); page_nesting.device_to_host_async(_stream); page_nesting_decode.device_to_host_async(_stream); - _stream.synchronize(); + + auto const decode_error = error_code.value(_stream); + if (decode_error != 0) { + std::stringstream stream; + stream << std::hex << decode_error; + CUDF_FAIL("Parquet data decode failed with code(s) 0x" + stream.str()); + } // for list columns, add the final offset to every offset buffer. // TODO : make this happen in more efficiently. Maybe use thrust::for_each From 53f0f74f6c6d66441225278f19a69885fb8b43c6 Mon Sep 17 00:00:00 2001 From: nvdbaranec <56695930+nvdbaranec@users.noreply.github.com> Date: Wed, 27 Sep 2023 23:32:46 -0500 Subject: [PATCH 226/230] Support for progressive parquet chunked reading. (#14079) Previously, the parquet chunked reader operated by controlling the size of output chunks only. It would still ingest the entire input file and decompress it, which can take up a considerable amount of memory. With this new 'progressive' support, we also 'chunk' at the input level. Specifically, the user can pass a `pass_read_limit` value which controls how much memory is used for storing compressed/decompressed data. The reader will make multiple passes over the file, reading as many row groups as it can to attempt to fit within this limit. Within each pass, chunks are emitted as before. From the external user's perspective, the chunked read mechanism is the same. You call `has_next()` and `read_chunk()`. If the user has specified a value for `pass_read_limit` the set of chunks produced might end up being different (although the concatenation of all of them will still be the same). The core idea of the code change is to add the idea of the internal `pass`. Previously we had a `file_intermediate_data` which held data across `read_chunk()` calls. There is now a `pass_intermediate_data` struct which holds information specific to a given pass. Many of the invariant things from the file level before (row groups and chunks to process) are now stored in the pass intermediate data. As we begin each pass, we take the subset of global row groups and chunks that we are going to process for this pass, copy them to out intermediate data, and the remainder of the reader reference this instead of the file-level data. 
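From the caller's side, usage looks roughly like this (a sketch only; the file path and byte limits are placeholder values, and the new constructor itself is shown below):

```
// Sketch: read a Parquet file in multiple passes, each pass emitting one or
// more output chunks. Limits and file path are illustrative values.
auto const options =
  cudf::io::parquet_reader_options::builder(cudf::io::source_info{"example.parquet"}).build();

std::size_t const chunk_read_limit = 512 * 1024 * 1024;     // soft cap on each output chunk
std::size_t const pass_read_limit  = 1024ul * 1024 * 1024;  // cap on temporary (decompression) memory

cudf::io::chunked_parquet_reader reader(chunk_read_limit, pass_read_limit, options);
while (reader.has_next()) {
  auto chunk = reader.read_chunk();  // table_with_metadata for this output chunk
  // ... consume chunk.tbl ...
}
```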
In order to avoid breaking pre-existing interfaces, there's a new contructor for the `chunked_parquet_reader` class: ``` chunked_parquet_reader( std::size_t chunk_read_limit, std::size_t pass_read_limit, parquet_reader_options const& options, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); ``` Authors: - https://github.com/nvdbaranec Approvers: - Yunsong Wang (https://github.com/PointKernel) - Vukasin Milovanovic (https://github.com/vuule) URL: https://github.com/rapidsai/cudf/pull/14079 --- cpp/include/cudf/io/detail/parquet.hpp | 39 ++- cpp/include/cudf/io/parquet.hpp | 24 ++ cpp/src/io/functions.cpp | 17 + cpp/src/io/parquet/parquet_gpu.hpp | 69 +++- cpp/src/io/parquet/reader.cpp | 4 +- cpp/src/io/parquet/reader_impl.cpp | 128 ++++--- cpp/src/io/parquet/reader_impl.hpp | 52 ++- cpp/src/io/parquet/reader_impl_helpers.cpp | 4 +- cpp/src/io/parquet/reader_impl_helpers.hpp | 15 +- cpp/src/io/parquet/reader_impl_preprocess.cu | 344 +++++++++++++------ cpp/tests/io/parquet_chunked_reader_test.cpp | 68 +++- 11 files changed, 561 insertions(+), 203 deletions(-) diff --git a/cpp/include/cudf/io/detail/parquet.hpp b/cpp/include/cudf/io/detail/parquet.hpp index 3f2e1fa5e6c..074f690d2c7 100644 --- a/cpp/include/cudf/io/detail/parquet.hpp +++ b/cpp/include/cudf/io/detail/parquet.hpp @@ -91,7 +91,8 @@ class reader { class chunked_reader : private reader { public: /** - * @brief Constructor from a read size limit and an array of data sources with reader options. + * @brief Constructor from an output size memory limit and an input size memory limit and an array + * of data sources with reader options. * * The typical usage should be similar to this: * ``` @@ -102,17 +103,45 @@ class chunked_reader : private reader { * * ``` * - * If `chunk_read_limit == 0` (i.e., no reading limit), a call to `read_chunk()` will read the - * whole file and return a table containing all rows. + * If `chunk_read_limit == 0` (i.e., no output limit), and `pass_read_limit == 0` (no input + * temporary memory size limit) a call to `read_chunk()` will read the whole file and return a + * table containing all rows. + * + * The chunk_read_limit parameter controls the size of the output chunks produces. If the user + * specifies 100 MB of data, the reader will attempt to return chunks containing tables that have + * a total bytes size (over all columns) of 100 MB or less. This is a soft limit and the code + * will not fail if it cannot satisfy the limit. It will make a best-effort atttempt only. + * + * The pass_read_limit parameter controls how much temporary memory is used in the process of + * decoding the file. The primary contributor to this memory usage is the uncompressed size of + * the data read out of the file and the decompressed (but not yet decoded) size of the data. The + * granularity of a given pass is at the row group level. It will not attempt to read at the sub + * row-group level. + * + * Combined, the way to visualize passes and chunks is as follows: + * + * @code{.pseudo} + * for(each pass){ + * for(each output chunk within a pass){ + * return a table that fits within the output chunk limit + * } + * } + * @endcode + * + * With a pass_read_limit of `0` you are simply saying you have one pass that reads the entire + * file as normal. 
* * @param chunk_read_limit Limit on total number of bytes to be returned per read, - * or `0` if there is no limit + * or `0` if there is no limit + * @param pass_read_limit Limit on total amount of memory used for temporary computations during + * loading, or `0` if there is no limit * @param sources Input `datasource` objects to read the dataset from * @param options Settings for controlling reading behavior - * @param stream CUDA stream used for device memory operations and kernel launches. + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource to use for device memory allocation */ explicit chunked_reader(std::size_t chunk_read_limit, + std::size_t pass_read_limit, std::vector>&& sources, parquet_reader_options const& options, rmm::cuda_stream_view stream, diff --git a/cpp/include/cudf/io/parquet.hpp b/cpp/include/cudf/io/parquet.hpp index 788ff15f3c1..deaf23d405a 100644 --- a/cpp/include/cudf/io/parquet.hpp +++ b/cpp/include/cudf/io/parquet.hpp @@ -445,6 +445,30 @@ class chunked_parquet_reader { parquet_reader_options const& options, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + /** + * @brief Constructor for chunked reader. + * + * This constructor requires the same `parquet_reader_option` parameter as in + * `cudf::read_parquet()`, with additional parameters to specify the size byte limit of the + * output table for each reading, and a byte limit on the amount of temporary memory to use + * when reading. pass_read_limit affects how many row groups we can read at a time by limiting + * the amount of memory dedicated to decompression space. pass_read_limit is a hint, not an + * absolute limit - if a single row group cannot fit within the limit given, it will still be + * loaded. + * + * @param chunk_read_limit Limit on total number of bytes to be returned per read, + * or `0` if there is no limit + * @param pass_read_limit Limit on the amount of memory used for reading and decompressing data or + * `0` if there is no limit + * @param options The options used to read Parquet file + * @param mr Device memory resource to use for device memory allocation + */ + chunked_parquet_reader( + std::size_t chunk_read_limit, + std::size_t pass_read_limit, + parquet_reader_options const& options, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + /** * @brief Destructor, destroying the internal reader instance. 
* diff --git a/cpp/src/io/functions.cpp b/cpp/src/io/functions.cpp index 45f8b0f8822..392a7850886 100644 --- a/cpp/src/io/functions.cpp +++ b/cpp/src/io/functions.cpp @@ -562,6 +562,23 @@ chunked_parquet_reader::chunked_parquet_reader(std::size_t chunk_read_limit, parquet_reader_options const& options, rmm::mr::device_memory_resource* mr) : reader{std::make_unique(chunk_read_limit, + 0, + make_datasources(options.get_source()), + options, + cudf::get_default_stream(), + mr)} +{ +} + +/** + * @copydoc cudf::io::chunked_parquet_reader::chunked_parquet_reader + */ +chunked_parquet_reader::chunked_parquet_reader(std::size_t chunk_read_limit, + std::size_t pass_read_limit, + parquet_reader_options const& options, + rmm::mr::device_memory_resource* mr) + : reader{std::make_unique(chunk_read_limit, + pass_read_limit, make_datasources(options.get_source()), options, cudf::get_default_stream(), diff --git a/cpp/src/io/parquet/parquet_gpu.hpp b/cpp/src/io/parquet/parquet_gpu.hpp index 3c37c0df021..51c862b376b 100644 --- a/cpp/src/io/parquet/parquet_gpu.hpp +++ b/cpp/src/io/parquet/parquet_gpu.hpp @@ -321,33 +321,74 @@ struct ColumnChunkDesc { }; /** - * @brief Struct to store raw/intermediate file data before parsing. + * @brief The row_group_info class + */ +struct row_group_info { + size_type index; // row group index within a file. aggregate_reader_metadata::get_row_group() is + // called with index and source_index + size_t start_row; + size_type source_index; // file index. + + row_group_info() = default; + + row_group_info(size_type index, size_t start_row, size_type source_index) + : index{index}, start_row{start_row}, source_index{source_index} + { + } +}; + +/** + * @brief Struct to store file-level data that remains constant for + * all passes/chunks for the file. */ struct file_intermediate_data { + // all row groups to read + std::vector row_groups{}; + + // all chunks from the selected row groups. We may end up reading these chunks progressively + // instead of all at once + std::vector chunks{}; + + // skip_rows/num_rows values for the entire file. these need to be adjusted per-pass because we + // may not be visiting every row group that contains these bounds + size_t global_skip_rows; + size_t global_num_rows; +}; + +/** + * @brief Structs to identify the reading row range for each chunk of rows in chunked reading. + */ +struct chunk_read_info { + size_t skip_rows; + size_t num_rows; +}; + +/** + * @brief Struct to store pass-level data that remains constant for a single pass. + */ +struct pass_intermediate_data { std::vector> raw_page_data; rmm::device_buffer decomp_page_data; + + // rowgroup, chunk and page information for the current pass. + std::vector row_groups{}; cudf::detail::hostdevice_vector chunks{}; cudf::detail::hostdevice_vector pages_info{}; cudf::detail::hostdevice_vector page_nesting_info{}; cudf::detail::hostdevice_vector page_nesting_decode_info{}; - rmm::device_buffer level_decode_data; - int level_type_size; -}; - -/** - * @brief Struct to store intermediate page data for parsing each chunk of rows in chunked reading. - */ -struct chunk_intermediate_data { rmm::device_uvector page_keys{0, rmm::cuda_stream_default}; rmm::device_uvector page_index{0, rmm::cuda_stream_default}; rmm::device_uvector str_dict_index{0, rmm::cuda_stream_default}; -}; -/** - * @brief Structs to identify the reading row range for each chunk of rows in chunked reading. 
- */ -struct chunk_read_info { + std::vector output_chunk_read_info; + std::size_t current_output_chunk{0}; + + rmm::device_buffer level_decode_data{}; + int level_type_size{0}; + + // skip_rows and num_rows values for this particular pass. these may be adjusted values from the + // global values stored in file_intermediate_data. size_t skip_rows; size_t num_rows; }; diff --git a/cpp/src/io/parquet/reader.cpp b/cpp/src/io/parquet/reader.cpp index 7365c102d8f..1e87447006d 100644 --- a/cpp/src/io/parquet/reader.cpp +++ b/cpp/src/io/parquet/reader.cpp @@ -43,12 +43,14 @@ table_with_metadata reader::read(parquet_reader_options const& options) } chunked_reader::chunked_reader(std::size_t chunk_read_limit, + std::size_t pass_read_limit, std::vector>&& sources, parquet_reader_options const& options, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { - _impl = std::make_unique(chunk_read_limit, std::move(sources), options, stream, mr); + _impl = std::make_unique( + chunk_read_limit, pass_read_limit, std::move(sources), options, stream, mr); } chunked_reader::~chunked_reader() = default; diff --git a/cpp/src/io/parquet/reader_impl.cpp b/cpp/src/io/parquet/reader_impl.cpp index 6cbe64e227b..ea40f29a070 100644 --- a/cpp/src/io/parquet/reader_impl.cpp +++ b/cpp/src/io/parquet/reader_impl.cpp @@ -29,10 +29,10 @@ namespace cudf::io::detail::parquet { void reader::impl::decode_page_data(size_t skip_rows, size_t num_rows) { - auto& chunks = _file_itm_data.chunks; - auto& pages = _file_itm_data.pages_info; - auto& page_nesting = _file_itm_data.page_nesting_info; - auto& page_nesting_decode = _file_itm_data.page_nesting_decode_info; + auto& chunks = _pass_itm_data->chunks; + auto& pages = _pass_itm_data->pages_info; + auto& page_nesting = _pass_itm_data->page_nesting_info; + auto& page_nesting_decode = _pass_itm_data->page_nesting_decode_info; // Should not reach here if there is no page data. 
CUDF_EXPECTS(pages.size() > 0, "There is no page to decode"); @@ -55,7 +55,7 @@ void reader::impl::decode_page_data(size_t skip_rows, size_t num_rows) std::vector col_sizes(_input_columns.size(), 0L); if (has_strings) { gpu::ComputePageStringSizes( - pages, chunks, skip_rows, num_rows, _file_itm_data.level_type_size, _stream); + pages, chunks, skip_rows, num_rows, _pass_itm_data->level_type_size, _stream); col_sizes = calculate_page_string_offsets(); @@ -169,7 +169,7 @@ void reader::impl::decode_page_data(size_t skip_rows, size_t num_rows) int const nkernels = std::bitset<32>(kernel_mask).count(); auto streams = cudf::detail::fork_streams(_stream, nkernels); - auto const level_type_size = _file_itm_data.level_type_size; + auto const level_type_size = _pass_itm_data->level_type_size; // launch string decoder int s_idx = 0; @@ -277,6 +277,7 @@ reader::impl::impl(std::vector>&& sources, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) : impl(0 /*chunk_read_limit*/, + 0 /*input_pass_read_limit*/, std::forward>>(sources), options, stream, @@ -285,11 +286,16 @@ reader::impl::impl(std::vector>&& sources, } reader::impl::impl(std::size_t chunk_read_limit, + std::size_t pass_read_limit, std::vector>&& sources, parquet_reader_options const& options, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) - : _stream{stream}, _mr{mr}, _sources{std::move(sources)}, _chunk_read_limit{chunk_read_limit} + : _stream{stream}, + _mr{mr}, + _sources{std::move(sources)}, + _output_chunk_read_limit{chunk_read_limit}, + _input_pass_read_limit{pass_read_limit} { // Open and parse the source dataset metadata _metadata = std::make_unique(_sources); @@ -313,11 +319,8 @@ reader::impl::impl(std::size_t chunk_read_limit, _timestamp_type.id()); // Save the states of the output buffers for reuse in `chunk_read()`. - // Don't need to do it if we read the file all at once. - if (_chunk_read_limit > 0) { - for (auto const& buff : _output_buffers) { - _output_buffers_template.emplace_back(inline_column_buffer::empty_like(buff)); - } + for (auto const& buff : _output_buffers) { + _output_buffers_template.emplace_back(inline_column_buffer::empty_like(buff)); } } @@ -327,32 +330,62 @@ void reader::impl::prepare_data(int64_t skip_rows, host_span const> row_group_indices, std::optional> filter) { - if (_file_preprocessed) { return; } + // if we have not preprocessed at the whole-file level, do that now + if (!_file_preprocessed) { + // if filter is not empty, then create output types as vector and pass for filtering. + std::vector output_types; + if (filter.has_value()) { + std::transform(_output_buffers.cbegin(), + _output_buffers.cend(), + std::back_inserter(output_types), + [](auto const& col) { return col.type; }); + } + std::tie( + _file_itm_data.global_skip_rows, _file_itm_data.global_num_rows, _file_itm_data.row_groups) = + _metadata->select_row_groups( + row_group_indices, skip_rows, num_rows, output_types, filter, _stream); + + if (_file_itm_data.global_num_rows > 0 && not _file_itm_data.row_groups.empty() && + not _input_columns.empty()) { + // fills in chunk information without physically loading or decompressing + // the associated data + load_global_chunk_info(); + + // compute schedule of input reads. Each rowgroup contains 1 chunk per column. For now + // we will read an entire row group at a time. 
However, it is possible to do + // sub-rowgroup reads if we made some estimates on individual chunk sizes (tricky) and + // changed the high level structure such that we weren't always reading an entire table's + // worth of columns at once. + compute_input_pass_row_group_info(); + } - // if filter is not empty, then create output types as vector and pass for filtering. - std::vector output_types; - if (filter.has_value()) { - std::transform(_output_buffers.cbegin(), - _output_buffers.cend(), - std::back_inserter(output_types), - [](auto const& col) { return col.type; }); + _file_preprocessed = true; } - auto const [skip_rows_corrected, num_rows_corrected, row_groups_info] = - _metadata->select_row_groups( - row_group_indices, skip_rows, num_rows, output_types, filter, _stream); - - if (num_rows_corrected > 0 && not row_groups_info.empty() && not _input_columns.empty()) { - load_and_decompress_data(row_groups_info, num_rows_corrected); - preprocess_pages( - skip_rows_corrected, num_rows_corrected, uses_custom_row_bounds, _chunk_read_limit); - - if (_chunk_read_limit == 0) { // read the whole file at once - CUDF_EXPECTS(_chunk_read_info.size() == 1, - "Reading the whole file should yield only one chunk."); + + // if we have to start a new pass, do that now + if (!_pass_preprocessed) { + auto const num_passes = _input_pass_row_group_offsets.size() - 1; + + // always create the pass struct, even if we end up with no passes. + // this will also cause the previous pass information to be deleted + _pass_itm_data = std::make_unique(); + + if (_file_itm_data.global_num_rows > 0 && not _file_itm_data.row_groups.empty() && + not _input_columns.empty() && _current_input_pass < num_passes) { + // setup the pass_intermediate_info for this pass. + setup_pass(); + + load_and_decompress_data(); + preprocess_pages(uses_custom_row_bounds, _output_chunk_read_limit); + + if (_output_chunk_read_limit == 0) { // read the whole file at once + CUDF_EXPECTS(_pass_itm_data->output_chunk_read_info.size() == 1, + "Reading the whole file should yield only one chunk."); + } } - } - _file_preprocessed = true; + _pass_preprocessed = true; + } } void reader::impl::populate_metadata(table_metadata& out_metadata) @@ -382,11 +415,12 @@ table_with_metadata reader::impl::read_chunk_internal( auto out_columns = std::vector>{}; out_columns.reserve(_output_buffers.size()); - if (!has_next() || _chunk_read_info.empty()) { + if (!has_next() || _pass_itm_data->output_chunk_read_info.empty()) { return finalize_output(out_metadata, out_columns, filter); } - auto const& read_info = _chunk_read_info[_current_read_chunk++]; + auto const& read_info = + _pass_itm_data->output_chunk_read_info[_pass_itm_data->current_output_chunk]; // Allocate memory buffers for the output columns. allocate_columns(read_info.skip_rows, read_info.num_rows, uses_custom_row_bounds); @@ -439,6 +473,17 @@ table_with_metadata reader::impl::finalize_output( _output_metadata = std::make_unique(out_metadata); } + // advance chunks/passes as necessary + _pass_itm_data->current_output_chunk++; + _chunk_count++; + if (_pass_itm_data->current_output_chunk >= _pass_itm_data->output_chunk_read_info.size()) { + _pass_itm_data->current_output_chunk = 0; + _pass_itm_data->output_chunk_read_info.clear(); + + _current_input_pass++; + _pass_preprocessed = false; + } + if (filter.has_value()) { auto read_table = std::make_unique
(std::move(out_columns)); auto predicate = cudf::detail::compute_column( @@ -458,7 +503,8 @@ table_with_metadata reader::impl::read( host_span const> row_group_indices, std::optional> filter) { - CUDF_EXPECTS(_chunk_read_limit == 0, "Reading the whole file must not have non-zero byte_limit."); + CUDF_EXPECTS(_output_chunk_read_limit == 0, + "Reading the whole file must not have non-zero byte_limit."); table_metadata metadata; populate_metadata(metadata); auto expr_conv = named_to_reference_converter(filter, metadata); @@ -472,7 +518,7 @@ table_with_metadata reader::impl::read_chunk() { // Reset the output buffers to their original states (right after reader construction). // Don't need to do it if we read the file all at once. - if (_chunk_read_limit > 0) { + if (_chunk_count > 0) { _output_buffers.resize(0); for (auto const& buff : _output_buffers_template) { _output_buffers.emplace_back(inline_column_buffer::empty_like(buff)); @@ -494,7 +540,11 @@ bool reader::impl::has_next() true /*uses_custom_row_bounds*/, {} /*row_group_indices, empty means read all row groups*/, std::nullopt /*filter*/); - return _current_read_chunk < _chunk_read_info.size(); + + auto const num_input_passes = + _input_pass_row_group_offsets.size() == 0 ? 0 : _input_pass_row_group_offsets.size() - 1; + return (_pass_itm_data->current_output_chunk < _pass_itm_data->output_chunk_read_info.size()) || + (_current_input_pass < num_input_passes); } namespace { diff --git a/cpp/src/io/parquet/reader_impl.hpp b/cpp/src/io/parquet/reader_impl.hpp index a980670e465..9445e4d1648 100644 --- a/cpp/src/io/parquet/reader_impl.hpp +++ b/cpp/src/io/parquet/reader_impl.hpp @@ -90,17 +90,20 @@ class reader::impl { * ``` * * Reading the whole given file at once through `read()` function is still supported if - * `chunk_read_limit == 0` (i.e., no reading limit). - * In such case, `read_chunk()` will also return rows of the entire file. + * `chunk_read_limit == 0` (i.e., no reading limit) and `pass_read_limit == 0` (no temporary + * memory limit) In such case, `read_chunk()` will also return rows of the entire file. * * @param chunk_read_limit Limit on total number of bytes to be returned per read, * or `0` if there is no limit + * @param pass_read_limit Limit on memory usage for the purposes of decompression and processing + * of input, or `0` if there is no limit. * @param sources Dataset sources * @param options Settings for controlling reading behavior * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource to use for device memory allocation */ explicit impl(std::size_t chunk_read_limit, + std::size_t pass_read_limit, std::vector>&& sources, parquet_reader_options const& options, rmm::cuda_stream_view stream, @@ -133,22 +136,22 @@ class reader::impl { host_span const> row_group_indices, std::optional> filter); + void load_global_chunk_info(); + void compute_input_pass_row_group_info(); + void setup_pass(); + /** * @brief Create chunk information and start file reads * - * @param row_groups_info vector of information about row groups to read - * @param num_rows Maximum number of rows to read * @return pair of boolean indicating if compressed chunks were found and a vector of futures for * read completion */ - std::pair>> create_and_read_column_chunks( - cudf::host_span const row_groups_info, size_type num_rows); + std::pair>> read_and_decompress_column_chunks(); /** * @brief Load and decompress the input file(s) into memory. 
*/ - void load_and_decompress_data(cudf::host_span const row_groups_info, - size_type num_rows); + void load_and_decompress_data(); /** * @brief Perform some preprocessing for page data and also compute the split locations @@ -161,17 +164,12 @@ class reader::impl { * * For flat schemas, these values are computed during header decoding (see gpuDecodePageHeaders). * - * @param skip_rows Crop all rows below skip_rows - * @param num_rows Maximum number of rows to read * @param uses_custom_row_bounds Whether or not num_rows and skip_rows represents user-specific * bounds * @param chunk_read_limit Limit on total number of bytes to be returned per read, * or `0` if there is no limit */ - void preprocess_pages(size_t skip_rows, - size_t num_rows, - bool uses_custom_row_bounds, - size_t chunk_read_limit); + void preprocess_pages(bool uses_custom_row_bounds, size_t chunk_read_limit); /** * @brief Allocate nesting information storage for all pages and set pointers to it. @@ -278,12 +276,28 @@ class reader::impl { std::optional> _reader_column_schema; data_type _timestamp_type{type_id::EMPTY}; - // Variables used for chunked reading: + // chunked reading happens in 2 parts: + // + // At the top level there is the "pass" in which we try and limit the + // total amount of temporary memory (compressed data, decompressed data) in use + // via _input_pass_read_limit. + // + // Within a pass, we produce one or more chunks of output, whose maximum total + // byte size is controlled by _output_chunk_read_limit. + cudf::io::parquet::gpu::file_intermediate_data _file_itm_data; - cudf::io::parquet::gpu::chunk_intermediate_data _chunk_itm_data; - std::vector _chunk_read_info; - std::size_t _chunk_read_limit{0}; - std::size_t _current_read_chunk{0}; + std::unique_ptr _pass_itm_data; + + // an array of offsets into _file_itm_data::global_chunks. Each pair of offsets represents + // the start/end of the chunks to be loaded for a given pass. 
+ std::vector _input_pass_row_group_offsets{}; + std::vector _input_pass_row_count{}; + std::size_t _current_input_pass{0}; + std::size_t _chunk_count{0}; + + std::size_t _output_chunk_read_limit{0}; + std::size_t _input_pass_read_limit{0}; + bool _pass_preprocessed{false}; bool _file_preprocessed{false}; }; diff --git a/cpp/src/io/parquet/reader_impl_helpers.cpp b/cpp/src/io/parquet/reader_impl_helpers.cpp index f6dbeb275fc..fcaa610fbb7 100644 --- a/cpp/src/io/parquet/reader_impl_helpers.cpp +++ b/cpp/src/io/parquet/reader_impl_helpers.cpp @@ -344,7 +344,7 @@ std::vector aggregate_reader_metadata::get_pandas_index_names() con return names; } -std::tuple> +std::tuple> aggregate_reader_metadata::select_row_groups( host_span const> row_group_indices, int64_t skip_rows_opt, @@ -362,7 +362,7 @@ aggregate_reader_metadata::select_row_groups( host_span const>(filtered_row_group_indices.value()); } } - std::vector selection; + std::vector selection; auto [rows_to_skip, rows_to_read] = [&]() { if (not row_group_indices.empty()) { return std::pair{}; } auto const from_opts = cudf::io::detail::skip_rows_num_rows_from_options( diff --git a/cpp/src/io/parquet/reader_impl_helpers.hpp b/cpp/src/io/parquet/reader_impl_helpers.hpp index 751ffc33123..61e4f94df0f 100644 --- a/cpp/src/io/parquet/reader_impl_helpers.hpp +++ b/cpp/src/io/parquet/reader_impl_helpers.hpp @@ -53,19 +53,6 @@ using namespace cudf::io::parquet; : data_type{t_id}; } -/** - * @brief The row_group_info class - */ -struct row_group_info { - size_type const index; - size_t const start_row; // TODO source index - size_type const source_index; - row_group_info(size_type index, size_t start_row, size_type source_index) - : index(index), start_row(start_row), source_index(source_index) - { - } -}; - /** * @brief Class for parsing dataset metadata */ @@ -194,7 +181,7 @@ class aggregate_reader_metadata { * @return A tuple of corrected row_start, row_count and list of row group indexes and its * starting row */ - [[nodiscard]] std::tuple> select_row_groups( + [[nodiscard]] std::tuple> select_row_groups( host_span const> row_group_indices, int64_t row_start, std::optional const& row_count, diff --git a/cpp/src/io/parquet/reader_impl_preprocess.cu b/cpp/src/io/parquet/reader_impl_preprocess.cu index a2db0de26bb..c731c467f2c 100644 --- a/cpp/src/io/parquet/reader_impl_preprocess.cu +++ b/cpp/src/io/parquet/reader_impl_preprocess.cu @@ -577,10 +577,10 @@ int decode_page_headers(cudf::detail::hostdevice_vector& c void reader::impl::allocate_nesting_info() { - auto const& chunks = _file_itm_data.chunks; - auto& pages = _file_itm_data.pages_info; - auto& page_nesting_info = _file_itm_data.page_nesting_info; - auto& page_nesting_decode_info = _file_itm_data.page_nesting_decode_info; + auto const& chunks = _pass_itm_data->chunks; + auto& pages = _pass_itm_data->pages_info; + auto& page_nesting_info = _pass_itm_data->page_nesting_info; + auto& page_nesting_decode_info = _pass_itm_data->page_nesting_decode_info; // compute total # of page_nesting infos needed and allocate space. doing this in one // buffer to keep it to a single gpu allocation @@ -702,38 +702,39 @@ void reader::impl::allocate_nesting_info() void reader::impl::allocate_level_decode_space() { - auto& pages = _file_itm_data.pages_info; + auto& pages = _pass_itm_data->pages_info; // TODO: this could be made smaller if we ignored dictionary pages and pages with no // repetition data. 
size_t const per_page_decode_buf_size = - LEVEL_DECODE_BUF_SIZE * 2 * _file_itm_data.level_type_size; + LEVEL_DECODE_BUF_SIZE * 2 * _pass_itm_data->level_type_size; auto const decode_buf_size = per_page_decode_buf_size * pages.size(); - _file_itm_data.level_decode_data = + _pass_itm_data->level_decode_data = rmm::device_buffer(decode_buf_size, _stream, rmm::mr::get_current_device_resource()); // distribute the buffers - uint8_t* buf = static_cast(_file_itm_data.level_decode_data.data()); + uint8_t* buf = static_cast(_pass_itm_data->level_decode_data.data()); for (size_t idx = 0; idx < pages.size(); idx++) { auto& p = pages[idx]; p.lvl_decode_buf[gpu::level_type::DEFINITION] = buf; - buf += (LEVEL_DECODE_BUF_SIZE * _file_itm_data.level_type_size); + buf += (LEVEL_DECODE_BUF_SIZE * _pass_itm_data->level_type_size); p.lvl_decode_buf[gpu::level_type::REPETITION] = buf; - buf += (LEVEL_DECODE_BUF_SIZE * _file_itm_data.level_type_size); + buf += (LEVEL_DECODE_BUF_SIZE * _pass_itm_data->level_type_size); } } -std::pair>> reader::impl::create_and_read_column_chunks( - cudf::host_span const row_groups_info, size_type num_rows) +std::pair>> reader::impl::read_and_decompress_column_chunks() { - auto& raw_page_data = _file_itm_data.raw_page_data; - auto& chunks = _file_itm_data.chunks; + auto const& row_groups_info = _pass_itm_data->row_groups; + auto const num_rows = _pass_itm_data->num_rows; + + auto& raw_page_data = _pass_itm_data->raw_page_data; + auto& chunks = _pass_itm_data->chunks; // Descriptors for all the chunks that make up the selected columns auto const num_input_columns = _input_columns.size(); auto const num_chunks = row_groups_info.size() * num_input_columns; - chunks = cudf::detail::hostdevice_vector(0, num_chunks, _stream); // Association between each column chunk and its source std::vector chunk_source_map(num_chunks); @@ -747,13 +748,68 @@ std::pair>> reader::impl::create_and_read_co // Initialize column chunk information size_t total_decompressed_size = 0; auto remaining_rows = num_rows; - std::vector> read_rowgroup_tasks; + std::vector> read_chunk_tasks; + size_type chunk_count = 0; for (auto const& rg : row_groups_info) { auto const& row_group = _metadata->get_row_group(rg.index, rg.source_index); - auto const row_group_start = rg.start_row; auto const row_group_source = rg.source_index; auto const row_group_rows = std::min(remaining_rows, row_group.num_rows); + // generate ColumnChunkDesc objects for everything to be decoded (all input columns) + for (size_t i = 0; i < num_input_columns; ++i) { + auto const& col = _input_columns[i]; + // look up metadata + auto& col_meta = _metadata->get_column_metadata(rg.index, rg.source_index, col.schema_idx); + + column_chunk_offsets[chunk_count] = + (col_meta.dictionary_page_offset != 0) + ? 
std::min(col_meta.data_page_offset, col_meta.dictionary_page_offset) + : col_meta.data_page_offset; + + // Map each column chunk to its column index and its source index + chunk_source_map[chunk_count] = row_group_source; + + if (col_meta.codec != Compression::UNCOMPRESSED) { + total_decompressed_size += col_meta.total_uncompressed_size; + } + + chunk_count++; + } + remaining_rows -= row_group_rows; + } + + // Read compressed chunk data to device memory + read_chunk_tasks.push_back(read_column_chunks_async(_sources, + raw_page_data, + chunks, + 0, + chunks.size(), + column_chunk_offsets, + chunk_source_map, + _stream)); + + CUDF_EXPECTS(remaining_rows == 0, "All rows data must be read."); + + return {total_decompressed_size > 0, std::move(read_chunk_tasks)}; +} + +void reader::impl::load_global_chunk_info() +{ + auto const num_rows = _file_itm_data.global_num_rows; + auto const& row_groups_info = _file_itm_data.row_groups; + auto& chunks = _file_itm_data.chunks; + + // Descriptors for all the chunks that make up the selected columns + auto const num_input_columns = _input_columns.size(); + auto const num_chunks = row_groups_info.size() * num_input_columns; + + // Initialize column chunk information + auto remaining_rows = num_rows; + for (auto const& rg : row_groups_info) { + auto const& row_group = _metadata->get_row_group(rg.index, rg.source_index); + auto const row_group_start = rg.start_row; + auto const row_group_rows = std::min(remaining_rows, row_group.num_rows); + // generate ColumnChunkDesc objects for everything to be decoded (all input columns) for (size_t i = 0; i < num_input_columns; ++i) { auto col = _input_columns[i]; @@ -768,11 +824,6 @@ std::pair>> reader::impl::create_and_read_co schema.converted_type, schema.type_length); - column_chunk_offsets[chunks.size()] = - (col_meta.dictionary_page_offset != 0) - ? std::min(col_meta.data_page_offset, col_meta.dictionary_page_offset) - : col_meta.data_page_offset; - chunks.push_back(gpu::ColumnChunkDesc(col_meta.total_compressed_size, nullptr, col_meta.num_values, @@ -792,92 +843,171 @@ std::pair>> reader::impl::create_and_read_co clock_rate, i, col.schema_idx)); - - // Map each column chunk to its column index and its source index - chunk_source_map[chunks.size() - 1] = row_group_source; - - if (col_meta.codec != Compression::UNCOMPRESSED) { - total_decompressed_size += col_meta.total_uncompressed_size; - } } + remaining_rows -= row_group_rows; } +} - // Read compressed chunk data to device memory - read_rowgroup_tasks.push_back(read_column_chunks_async(_sources, - raw_page_data, - chunks, - 0, - chunks.size(), - column_chunk_offsets, - chunk_source_map, - _stream)); +void reader::impl::compute_input_pass_row_group_info() +{ + // at this point, row_groups has already been filtered down to just the row groups we need to + // handle optional skip_rows/num_rows parameters. + auto const& row_groups_info = _file_itm_data.row_groups; + + // if the user hasn't specified an input size limit, read everything in a single pass. + if (_input_pass_read_limit == 0) { + _input_pass_row_group_offsets.push_back(0); + _input_pass_row_group_offsets.push_back(row_groups_info.size()); + return; + } - CUDF_EXPECTS(remaining_rows == 0, "All rows data must be read."); + // generate passes. make sure to account for the case where a single row group doesn't fit within + // + std::size_t const read_limit = + _input_pass_read_limit > 0 ? 
_input_pass_read_limit : std::numeric_limits::max(); + std::size_t cur_pass_byte_size = 0; + std::size_t cur_rg_start = 0; + std::size_t cur_row_count = 0; + _input_pass_row_group_offsets.push_back(0); + _input_pass_row_count.push_back(0); + + for (size_t cur_rg_index = 0; cur_rg_index < row_groups_info.size(); cur_rg_index++) { + auto const& rgi = row_groups_info[cur_rg_index]; + auto const& row_group = _metadata->get_row_group(rgi.index, rgi.source_index); + + // can we add this row group + if (cur_pass_byte_size + row_group.total_byte_size >= read_limit) { + // A single row group (the current one) is larger than the read limit: + // We always need to include at least one row group, so end the pass at the end of the current + // row group + if (cur_rg_start == cur_rg_index) { + _input_pass_row_group_offsets.push_back(cur_rg_index + 1); + _input_pass_row_count.push_back(cur_row_count + row_group.num_rows); + cur_rg_start = cur_rg_index + 1; + cur_pass_byte_size = 0; + } + // End the pass at the end of the previous row group + else { + _input_pass_row_group_offsets.push_back(cur_rg_index); + _input_pass_row_count.push_back(cur_row_count); + cur_rg_start = cur_rg_index; + cur_pass_byte_size = row_group.total_byte_size; + } + } else { + cur_pass_byte_size += row_group.total_byte_size; + } + cur_row_count += row_group.num_rows; + } + // add the last pass if necessary + if (_input_pass_row_group_offsets.back() != row_groups_info.size()) { + _input_pass_row_group_offsets.push_back(row_groups_info.size()); + _input_pass_row_count.push_back(cur_row_count); + } +} - return {total_decompressed_size > 0, std::move(read_rowgroup_tasks)}; +void reader::impl::setup_pass() +{ + // this will also cause the previous pass information to be deleted + _pass_itm_data = std::make_unique(); + + // setup row groups to be loaded for this pass + auto const row_group_start = _input_pass_row_group_offsets[_current_input_pass]; + auto const row_group_end = _input_pass_row_group_offsets[_current_input_pass + 1]; + auto const num_row_groups = row_group_end - row_group_start; + _pass_itm_data->row_groups.resize(num_row_groups); + std::copy(_file_itm_data.row_groups.begin() + row_group_start, + _file_itm_data.row_groups.begin() + row_group_end, + _pass_itm_data->row_groups.begin()); + + auto const num_passes = _input_pass_row_group_offsets.size() - 1; + CUDF_EXPECTS(_current_input_pass < num_passes, "Encountered an invalid read pass index"); + + auto const chunks_per_rowgroup = _input_columns.size(); + auto const num_chunks = chunks_per_rowgroup * num_row_groups; + + auto chunk_start = _file_itm_data.chunks.begin() + (row_group_start * chunks_per_rowgroup); + auto chunk_end = _file_itm_data.chunks.begin() + (row_group_end * chunks_per_rowgroup); + + _pass_itm_data->chunks = + cudf::detail::hostdevice_vector(num_chunks, _stream); + std::copy(chunk_start, chunk_end, _pass_itm_data->chunks.begin()); + + // adjust skip_rows and num_rows by what's available in the row groups we are processing + if (num_passes == 1) { + _pass_itm_data->skip_rows = _file_itm_data.global_skip_rows; + _pass_itm_data->num_rows = _file_itm_data.global_num_rows; + } else { + auto const global_start_row = _file_itm_data.global_skip_rows; + auto const global_end_row = global_start_row + _file_itm_data.global_num_rows; + auto const start_row = std::max(_input_pass_row_count[_current_input_pass], global_start_row); + auto const end_row = std::min(_input_pass_row_count[_current_input_pass + 1], global_end_row); + + // skip_rows is always global in the 
sense that it is relative to the first row of + // everything we will be reading, regardless of what pass we are on. + // num_rows is how many rows we are reading this pass. + _pass_itm_data->skip_rows = global_start_row + _input_pass_row_count[_current_input_pass]; + _pass_itm_data->num_rows = end_row - start_row; + } } -void reader::impl::load_and_decompress_data( - cudf::host_span const row_groups_info, size_type num_rows) +void reader::impl::load_and_decompress_data() { // This function should never be called if `num_rows == 0`. - CUDF_EXPECTS(num_rows > 0, "Number of reading rows must not be zero."); + CUDF_EXPECTS(_pass_itm_data->num_rows > 0, "Number of reading rows must not be zero."); - auto& raw_page_data = _file_itm_data.raw_page_data; - auto& decomp_page_data = _file_itm_data.decomp_page_data; - auto& chunks = _file_itm_data.chunks; - auto& pages = _file_itm_data.pages_info; + auto& raw_page_data = _pass_itm_data->raw_page_data; + auto& decomp_page_data = _pass_itm_data->decomp_page_data; + auto& chunks = _pass_itm_data->chunks; + auto& pages = _pass_itm_data->pages_info; - auto const [has_compressed_data, read_rowgroup_tasks] = - create_and_read_column_chunks(row_groups_info, num_rows); + auto const [has_compressed_data, read_chunks_tasks] = read_and_decompress_column_chunks(); - for (auto& task : read_rowgroup_tasks) { + for (auto& task : read_chunks_tasks) { task.wait(); } // Process dataset chunk pages into output columns auto const total_pages = count_page_headers(chunks, _stream); + if (total_pages <= 0) { return; } pages = cudf::detail::hostdevice_vector(total_pages, total_pages, _stream); - if (total_pages > 0) { - // decoding of column/page information - _file_itm_data.level_type_size = decode_page_headers(chunks, pages, _stream); - if (has_compressed_data) { - decomp_page_data = decompress_page_data(chunks, pages, _stream); - // Free compressed data - for (size_t c = 0; c < chunks.size(); c++) { - if (chunks[c].codec != parquet::Compression::UNCOMPRESSED) { raw_page_data[c].reset(); } - } + // decoding of column/page information + _pass_itm_data->level_type_size = decode_page_headers(chunks, pages, _stream); + if (has_compressed_data) { + decomp_page_data = decompress_page_data(chunks, pages, _stream); + // Free compressed data + for (size_t c = 0; c < chunks.size(); c++) { + if (chunks[c].codec != parquet::Compression::UNCOMPRESSED) { raw_page_data[c].reset(); } } + } - // build output column info - // walk the schema, building out_buffers that mirror what our final cudf columns will look - // like. important : there is not necessarily a 1:1 mapping between input columns and output - // columns. For example, parquet does not explicitly store a ColumnChunkDesc for struct - // columns. The "structiness" is simply implied by the schema. For example, this schema: - // required group field_id=1 name { - // required binary field_id=2 firstname (String); - // required binary field_id=3 middlename (String); - // required binary field_id=4 lastname (String); - // } - // will only contain 3 columns of data (firstname, middlename, lastname). But of course - // "name" is a struct column that we want to return, so we have to make sure that we - // create it ourselves. 
- // std::vector output_info = build_output_column_info(); - - // the following two allocate functions modify the page data - pages.device_to_host_sync(_stream); - { - // nesting information (sizes, etc) stored -per page- - // note : even for flat schemas, we allocate 1 level of "nesting" info - allocate_nesting_info(); + // build output column info + // walk the schema, building out_buffers that mirror what our final cudf columns will look + // like. important : there is not necessarily a 1:1 mapping between input columns and output + // columns. For example, parquet does not explicitly store a ColumnChunkDesc for struct + // columns. The "structiness" is simply implied by the schema. For example, this schema: + // required group field_id=1 name { + // required binary field_id=2 firstname (String); + // required binary field_id=3 middlename (String); + // required binary field_id=4 lastname (String); + // } + // will only contain 3 columns of data (firstname, middlename, lastname). But of course + // "name" is a struct column that we want to return, so we have to make sure that we + // create it ourselves. + // std::vector output_info = build_output_column_info(); + + // the following two allocate functions modify the page data + pages.device_to_host_sync(_stream); + { + // nesting information (sizes, etc) stored -per page- + // note : even for flat schemas, we allocate 1 level of "nesting" info + allocate_nesting_info(); - // level decode space - allocate_level_decode_space(); - } - pages.host_to_device_async(_stream); + // level decode space + allocate_level_decode_space(); } + pages.host_to_device_async(_stream); } namespace { @@ -1183,7 +1313,7 @@ std::vector find_splits(std::vector c */ std::vector compute_splits( cudf::detail::hostdevice_vector& pages, - gpu::chunk_intermediate_data const& id, + gpu::pass_intermediate_data const& id, size_t num_rows, size_t chunk_read_limit, rmm::cuda_stream_view stream) @@ -1539,13 +1669,12 @@ struct page_offset_output_iter { } // anonymous namespace -void reader::impl::preprocess_pages(size_t skip_rows, - size_t num_rows, - bool uses_custom_row_bounds, - size_t chunk_read_limit) +void reader::impl::preprocess_pages(bool uses_custom_row_bounds, size_t chunk_read_limit) { - auto& chunks = _file_itm_data.chunks; - auto& pages = _file_itm_data.pages_info; + auto const skip_rows = _pass_itm_data->skip_rows; + auto const num_rows = _pass_itm_data->num_rows; + auto& chunks = _pass_itm_data->chunks; + auto& pages = _pass_itm_data->pages_info; // compute page ordering. 
// @@ -1636,7 +1765,7 @@ void reader::impl::preprocess_pages(size_t skip_rows, // Build index for string dictionaries since they can't be indexed // directly due to variable-sized elements - _chunk_itm_data.str_dict_index = + _pass_itm_data->str_dict_index = cudf::detail::make_zeroed_device_uvector_async( total_str_dict_indexes, _stream, rmm::mr::get_current_device_resource()); @@ -1646,7 +1775,7 @@ void reader::impl::preprocess_pages(size_t skip_rows, CUDF_EXPECTS(input_col.schema_idx == chunks[c].src_col_schema, "Column/page schema index mismatch"); if (is_dict_chunk(chunks[c])) { - chunks[c].str_dict_index = _chunk_itm_data.str_dict_index.data() + str_ofs; + chunks[c].str_dict_index = _pass_itm_data->str_dict_index.data() + str_ofs; str_ofs += pages[page_count].num_input_values; } @@ -1677,7 +1806,7 @@ void reader::impl::preprocess_pages(size_t skip_rows, std::numeric_limits::max(), true, // compute num_rows chunk_read_limit > 0, // compute string sizes - _file_itm_data.level_type_size, + _pass_itm_data->level_type_size, _stream); // computes: @@ -1699,20 +1828,21 @@ void reader::impl::preprocess_pages(size_t skip_rows, } // preserve page ordering data for string decoder - _chunk_itm_data.page_keys = std::move(page_keys); - _chunk_itm_data.page_index = std::move(page_index); + _pass_itm_data->page_keys = std::move(page_keys); + _pass_itm_data->page_index = std::move(page_index); // compute splits if necessary. otherwise return a single split representing // the whole file. - _chunk_read_info = chunk_read_limit > 0 - ? compute_splits(pages, _chunk_itm_data, num_rows, chunk_read_limit, _stream) - : std::vector{{skip_rows, num_rows}}; + _pass_itm_data->output_chunk_read_info = + _output_chunk_read_limit > 0 + ? compute_splits(pages, *_pass_itm_data, num_rows, chunk_read_limit, _stream) + : std::vector{{skip_rows, num_rows}}; } void reader::impl::allocate_columns(size_t skip_rows, size_t num_rows, bool uses_custom_row_bounds) { - auto const& chunks = _file_itm_data.chunks; - auto& pages = _file_itm_data.pages_info; + auto const& chunks = _pass_itm_data->chunks; + auto& pages = _pass_itm_data->pages_info; // Should not reach here if there is no page data. 
CUDF_EXPECTS(pages.size() > 0, "There is no page to parse"); @@ -1729,7 +1859,7 @@ void reader::impl::allocate_columns(size_t skip_rows, size_t num_rows, bool uses num_rows, false, // num_rows is already computed false, // no need to compute string sizes - _file_itm_data.level_type_size, + _pass_itm_data->level_type_size, _stream); // print_pages(pages, _stream); @@ -1766,7 +1896,7 @@ void reader::impl::allocate_columns(size_t skip_rows, size_t num_rows, bool uses // compute output column sizes by examining the pages of the -input- columns if (has_lists) { - auto& page_index = _chunk_itm_data.page_index; + auto& page_index = _pass_itm_data->page_index; std::vector h_cols_info; h_cols_info.reserve(_input_columns.size()); @@ -1846,10 +1976,10 @@ void reader::impl::allocate_columns(size_t skip_rows, size_t num_rows, bool uses std::vector reader::impl::calculate_page_string_offsets() { - auto& chunks = _file_itm_data.chunks; - auto& pages = _file_itm_data.pages_info; - auto const& page_keys = _chunk_itm_data.page_keys; - auto const& page_index = _chunk_itm_data.page_index; + auto& chunks = _pass_itm_data->chunks; + auto& pages = _pass_itm_data->pages_info; + auto const& page_keys = _pass_itm_data->page_keys; + auto const& page_index = _pass_itm_data->page_index; std::vector col_sizes(_input_columns.size(), 0L); rmm::device_uvector d_col_sizes(col_sizes.size(), _stream); diff --git a/cpp/tests/io/parquet_chunked_reader_test.cpp b/cpp/tests/io/parquet_chunked_reader_test.cpp index 9815304b965..05fb9a3ec48 100644 --- a/cpp/tests/io/parquet_chunked_reader_test.cpp +++ b/cpp/tests/io/parquet_chunked_reader_test.cpp @@ -100,11 +100,13 @@ auto write_file(std::vector>& input_columns, return std::pair{std::move(input_table), std::move(filepath)}; } -auto chunked_read(std::string const& filepath, std::size_t byte_limit) +auto chunked_read(std::string const& filepath, + std::size_t output_limit, + std::size_t input_limit = 0) { auto const read_opts = cudf::io::parquet_reader_options::builder(cudf::io::source_info{filepath}).build(); - auto reader = cudf::io::chunked_parquet_reader(byte_limit, read_opts); + auto reader = cudf::io::chunked_parquet_reader(output_limit, input_limit, read_opts); auto num_chunks = 0; auto out_tables = std::vector>{}; @@ -950,3 +952,65 @@ TEST_F(ParquetChunkedReaderTest, TestChunkedReadNullCount) EXPECT_EQ(reader.read_chunk().tbl->get_column(0).null_count(), page_limit_rows / 4); } while (reader.has_next()); } + +TEST_F(ParquetChunkedReaderTest, InputLimitSimple) +{ + auto const filepath = temp_env->get_temp_filepath("input_limit_10_rowgroups.parquet"); + + // This results in 10 grow groups, at 4001150 bytes per row group + constexpr int num_rows = 25'000'000; + auto value_iter = cudf::detail::make_counting_transform_iterator(0, [](int i) { return i; }); + cudf::test::fixed_width_column_wrapper expected(value_iter, value_iter + num_rows); + cudf::io::parquet_writer_options opts = + cudf::io::parquet_writer_options::builder(cudf::io::sink_info{filepath}, + cudf::table_view{{expected}}) + // note: it is unnecessary to force compression to NONE here because the size we are using in + // the row group is the uncompressed data size. But forcing the dictionary policy to + // dictionary_policy::NEVER is necessary to prevent changes in the + // decompressed-but-not-yet-decoded data. 
+ .dictionary_policy(cudf::io::dictionary_policy::NEVER); + + cudf::io::write_parquet(opts); + + { + // no chunking + auto const [result, num_chunks] = chunked_read(filepath, 0, 0); + EXPECT_EQ(num_chunks, 1); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->get_column(0)); + } + + { + // 25 chunks of 100k rows each + auto const [result, num_chunks] = chunked_read(filepath, 0, 1); + EXPECT_EQ(num_chunks, 25); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->get_column(0)); + } + + { + // 25 chunks of 100k rows each + auto const [result, num_chunks] = chunked_read(filepath, 0, 4000000); + EXPECT_EQ(num_chunks, 25); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->get_column(0)); + } + + { + // 25 chunks of 100k rows each + auto const [result, num_chunks] = chunked_read(filepath, 0, 4100000); + EXPECT_EQ(num_chunks, 25); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->get_column(0)); + } + + { + // 12 chunks of 200k rows each, plus 1 final chunk of 100k rows. + auto const [result, num_chunks] = chunked_read(filepath, 0, 8002301); + EXPECT_EQ(num_chunks, 13); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->get_column(0)); + } + + { + // 1 big chunk + auto const [result, num_chunks] = chunked_read(filepath, 0, size_t{1} * 1024 * 1024 * 1024); + EXPECT_EQ(num_chunks, 1); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->get_column(0)); + } +} From b2f00809f40e2e81b01214177b412456d40404cc Mon Sep 17 00:00:00 2001 From: GALI PREM SAGAR Date: Thu, 28 Sep 2023 12:16:29 -0500 Subject: [PATCH 227/230] Pin dask and distributed for 23.10 release (#14225) This PR pins `dask` and `distributed` to `2023.9.2` for `23.10` release. Authors: - GALI PREM SAGAR (https://github.com/galipremsagar) Approvers: - Ray Douglass (https://github.com/raydouglass) - Peter Andreas Entschev (https://github.com/pentschev) --- ci/test_wheel_dask_cudf.sh | 2 +- conda/environments/all_cuda-118_arch-x86_64.yaml | 6 +++--- conda/environments/all_cuda-120_arch-x86_64.yaml | 6 +++--- conda/recipes/custreamz/meta.yaml | 6 +++--- conda/recipes/dask-cudf/meta.yaml | 12 ++++++------ conda/recipes/dask-cudf/run_test.sh | 4 ++-- dependencies.yaml | 6 +++--- python/dask_cudf/pyproject.toml | 4 ++-- 8 files changed, 23 insertions(+), 23 deletions(-) diff --git a/ci/test_wheel_dask_cudf.sh b/ci/test_wheel_dask_cudf.sh index d6e7f4bf65e..0abee09ca8a 100755 --- a/ci/test_wheel_dask_cudf.sh +++ b/ci/test_wheel_dask_cudf.sh @@ -11,7 +11,7 @@ RAPIDS_PY_WHEEL_NAME="cudf_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-from python -m pip install --no-deps ./local-cudf-dep/cudf*.whl # Always install latest dask for testing -python -m pip install git+https://github.com/dask/dask.git@main git+https://github.com/dask/distributed.git@main git+https://github.com/rapidsai/dask-cuda.git@branch-23.10 +python -m pip install git+https://github.com/dask/dask.git@2023.9.2 git+https://github.com/dask/distributed.git@2023.9.2 git+https://github.com/rapidsai/dask-cuda.git@branch-23.10 # echo to expand wildcard before adding `[extra]` requires for pip python -m pip install $(echo ./dist/dask_cudf*.whl)[test] diff --git a/conda/environments/all_cuda-118_arch-x86_64.yaml b/conda/environments/all_cuda-118_arch-x86_64.yaml index 9fb991f9075..46b0b3799f2 100644 --- a/conda/environments/all_cuda-118_arch-x86_64.yaml +++ b/conda/environments/all_cuda-118_arch-x86_64.yaml @@ -25,10 +25,10 @@ dependencies: - cupy>=12.0.0 - cxx-compiler - cython>=3.0.0 -- dask-core>=2023.7.1 +- dask-core==2023.9.2 - dask-cuda==23.10.* -- dask>=2023.7.1 -- distributed>=2023.7.1 
+- dask==2023.9.2 +- distributed==2023.9.2 - dlpack>=0.5,<0.6.0a0 - doxygen=1.9.1 - fastavro>=0.22.9 diff --git a/conda/environments/all_cuda-120_arch-x86_64.yaml b/conda/environments/all_cuda-120_arch-x86_64.yaml index 9ba0dd8dc38..0e137c91120 100644 --- a/conda/environments/all_cuda-120_arch-x86_64.yaml +++ b/conda/environments/all_cuda-120_arch-x86_64.yaml @@ -26,10 +26,10 @@ dependencies: - cupy>=12.0.0 - cxx-compiler - cython>=3.0.0 -- dask-core>=2023.7.1 +- dask-core==2023.9.2 - dask-cuda==23.10.* -- dask>=2023.7.1 -- distributed>=2023.7.1 +- dask==2023.9.2 +- distributed==2023.9.2 - dlpack>=0.5,<0.6.0a0 - doxygen=1.9.1 - fastavro>=0.22.9 diff --git a/conda/recipes/custreamz/meta.yaml b/conda/recipes/custreamz/meta.yaml index 7aaa40bffd0..233d51baf31 100644 --- a/conda/recipes/custreamz/meta.yaml +++ b/conda/recipes/custreamz/meta.yaml @@ -45,9 +45,9 @@ requirements: - streamz - cudf ={{ version }} - cudf_kafka ={{ version }} - - dask >=2023.7.1 - - dask-core >=2023.7.1 - - distributed >=2023.7.1 + - dask ==2023.9.2 + - dask-core ==2023.9.2 + - distributed ==2023.9.2 - python-confluent-kafka >=1.9.0,<1.10.0a0 - {{ pin_compatible('cuda-version', max_pin='x', min_pin='x') }} diff --git a/conda/recipes/dask-cudf/meta.yaml b/conda/recipes/dask-cudf/meta.yaml index 12809ba648f..4c8af071074 100644 --- a/conda/recipes/dask-cudf/meta.yaml +++ b/conda/recipes/dask-cudf/meta.yaml @@ -38,16 +38,16 @@ requirements: host: - python - cudf ={{ version }} - - dask >=2023.7.1 - - dask-core >=2023.7.1 - - distributed >=2023.7.1 + - dask ==2023.9.2 + - dask-core ==2023.9.2 + - distributed ==2023.9.2 - cuda-version ={{ cuda_version }} run: - python - cudf ={{ version }} - - dask >=2023.7.1 - - dask-core >=2023.7.1 - - distributed >=2023.7.1 + - dask ==2023.9.2 + - dask-core ==2023.9.2 + - distributed ==2023.9.2 - {{ pin_compatible('cuda-version', max_pin='x', min_pin='x') }} test: diff --git a/conda/recipes/dask-cudf/run_test.sh b/conda/recipes/dask-cudf/run_test.sh index 7dc54747a0c..c79c014a89a 100644 --- a/conda/recipes/dask-cudf/run_test.sh +++ b/conda/recipes/dask-cudf/run_test.sh @@ -18,10 +18,10 @@ if [ "${ARCH}" = "aarch64" ]; then fi # Dask & Distributed option to install main(nightly) or `conda-forge` packages. 
-export INSTALL_DASK_MAIN=1 +export INSTALL_DASK_MAIN=0 # Dask version to install when `INSTALL_DASK_MAIN=0` -export DASK_STABLE_VERSION="2023.7.1" +export DASK_STABLE_VERSION="2023.9.2" # Install the conda-forge or nightly version of dask and distributed if [[ "${INSTALL_DASK_MAIN}" == 1 ]]; then diff --git a/dependencies.yaml b/dependencies.yaml index 5586f54348c..b21472df4fd 100644 --- a/dependencies.yaml +++ b/dependencies.yaml @@ -491,12 +491,12 @@ dependencies: common: - output_types: [conda, requirements, pyproject] packages: - - dask>=2023.7.1 - - distributed>=2023.7.1 + - dask==2023.9.2 + - distributed==2023.9.2 - output_types: conda packages: - cupy>=12.0.0 - - dask-core>=2023.7.1 # dask-core in conda is the actual package & dask is the meta package + - dask-core==2023.9.2 # dask-core in conda is the actual package & dask is the meta package - output_types: pyproject packages: - &cudf cudf==23.10.* diff --git a/python/dask_cudf/pyproject.toml b/python/dask_cudf/pyproject.toml index 922da366422..41b57b71749 100644 --- a/python/dask_cudf/pyproject.toml +++ b/python/dask_cudf/pyproject.toml @@ -20,8 +20,8 @@ requires-python = ">=3.9" dependencies = [ "cudf==23.10.*", "cupy-cuda11x>=12.0.0", - "dask>=2023.7.1", - "distributed>=2023.7.1", + "dask==2023.9.2", + "distributed==2023.9.2", "fsspec>=0.6.0", "numpy>=1.21,<1.25", "pandas>=1.3,<1.6.0dev0", From 66a655ce80e8b0accb80ea4e23799d23a82a35a2 Mon Sep 17 00:00:00 2001 From: Bradley Dice Date: Tue, 3 Oct 2023 08:00:44 -0500 Subject: [PATCH 228/230] Fix inaccuracy in decimal128 rounding. (#14233) Fixes a bug where floating-point values were used in decimal128 rounding, giving wrong results. Closes https://github.com/rapidsai/cudf/issues/14210. Authors: - Bradley Dice (https://github.com/bdice) Approvers: - Divye Gala (https://github.com/divyegala) - Mark Harris (https://github.com/harrism) --- cpp/src/round/round.cu | 5 ++- cpp/tests/round/round_tests.cpp | 79 +++++++++++++++++++++++++++++++++ 2 files changed, 83 insertions(+), 1 deletion(-) diff --git a/cpp/src/round/round.cu b/cpp/src/round/round.cu index 4b3f80fc6e2..41cce57d55b 100644 --- a/cpp/src/round/round.cu +++ b/cpp/src/round/round.cu @@ -271,7 +271,10 @@ std::unique_ptr round_with(column_view const& input, out_view.template end(), static_cast(0)); } else { - Type const n = std::pow(10, scale_movement); + Type n = 10; + for (int i = 1; i < scale_movement; ++i) { + n *= 10; + } thrust::transform(rmm::exec_policy(stream), input.begin(), input.end(), diff --git a/cpp/tests/round/round_tests.cpp b/cpp/tests/round/round_tests.cpp index d802c0c2706..f97bb7a5323 100644 --- a/cpp/tests/round/round_tests.cpp +++ b/cpp/tests/round/round_tests.cpp @@ -703,4 +703,83 @@ TEST_F(RoundTests, BoolTestHalfUp) EXPECT_THROW(cudf::round(input, -2, cudf::rounding_method::HALF_UP), cudf::logic_error); } +// Use __uint128_t for demonstration. 
+constexpr __uint128_t operator""_uint128_t(const char* s) +{ + __uint128_t ret = 0; + for (int i = 0; s[i] != '\0'; ++i) { + ret *= 10; + if ('0' <= s[i] && s[i] <= '9') { ret += s[i] - '0'; } + } + return ret; +} + +TEST_F(RoundTests, HalfEvenErrorsA) +{ + using namespace numeric; + using RepType = cudf::device_storage_type_t; + using fp_wrapper = cudf::test::fixed_point_column_wrapper; + + { + // 0.5 at scale -37 should round HALF_EVEN to 0, because 0 is an even number + auto const input = + fp_wrapper{{5000000000000000000000000000000000000_uint128_t}, scale_type{-37}}; + auto const expected = fp_wrapper{{0}, scale_type{0}}; + auto const result = cudf::round(input, 0, cudf::rounding_method::HALF_EVEN); + + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view()); + } +} + +TEST_F(RoundTests, HalfEvenErrorsB) +{ + using namespace numeric; + using RepType = cudf::device_storage_type_t; + using fp_wrapper = cudf::test::fixed_point_column_wrapper; + + { + // 0.125 at scale -37 should round HALF_EVEN to 0.12, because 2 is an even number + auto const input = + fp_wrapper{{1250000000000000000000000000000000000_uint128_t}, scale_type{-37}}; + auto const expected = fp_wrapper{{12}, scale_type{-2}}; + auto const result = cudf::round(input, 2, cudf::rounding_method::HALF_EVEN); + + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view()); + } +} + +TEST_F(RoundTests, HalfEvenErrorsC) +{ + using namespace numeric; + using RepType = cudf::device_storage_type_t; + using fp_wrapper = cudf::test::fixed_point_column_wrapper; + + { + // 0.0625 at scale -37 should round HALF_EVEN to 0.062, because 2 is an even number + auto const input = + fp_wrapper{{0625000000000000000000000000000000000_uint128_t}, scale_type{-37}}; + auto const expected = fp_wrapper{{62}, scale_type{-3}}; + auto const result = cudf::round(input, 3, cudf::rounding_method::HALF_EVEN); + + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view()); + } +} + +TEST_F(RoundTests, HalfUpErrorsA) +{ + using namespace numeric; + using RepType = cudf::device_storage_type_t; + using fp_wrapper = cudf::test::fixed_point_column_wrapper; + + { + // 0.25 at scale -37 should round HALF_UP to 0.3 + auto const input = + fp_wrapper{{2500000000000000000000000000000000000_uint128_t}, scale_type{-37}}; + auto const expected = fp_wrapper{{3}, scale_type{-1}}; + auto const result = cudf::round(input, 1, cudf::rounding_method::HALF_UP); + + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view()); + } +} + CUDF_TEST_PROGRAM_MAIN() From 3964950ba2fecf7f962917276058a6381d396246 Mon Sep 17 00:00:00 2001 From: Bradley Dice Date: Tue, 3 Oct 2023 15:11:15 -0500 Subject: [PATCH 229/230] Fix inaccurate ceil/floor and inaccurate rescaling casts of fixed-point values. (#14242) This is a follow-up PR to #14233. This PR fixes a bug where floating-point values were used as intermediates in ceil/floor unary operations and cast operations that require rescaling for fixed-point types, giving inaccurate results. 
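The root cause is that a `double` cannot represent every large power of ten exactly, so building a rescaling factor with floating-point math and then truncating it back to a 128-bit integer can be off in the last few digits. A minimal standalone sketch of the failure mode (not part of this patch; it assumes a compiler that provides the `__int128` extension, such as GCC or Clang):

```cpp
#include <cmath>
#include <iostream>

int main()
{
  // Exact 10^25, computed with integer-only multiplication.
  __int128 exact = 1;
  for (int i = 0; i < 25; ++i) { exact *= 10; }

  // 10^25 routed through a double, roughly what the old rescaling path did.
  // A double carries only ~15-16 significant decimal digits, so casting the
  // result back to a 128-bit integer does not recover the exact power of ten.
  __int128 via_double = static_cast<__int128>(std::pow(10.0, 25));

  // Expected to print "different" on typical platforms.
  std::cout << (via_double == exact ? "equal" : "different") << "\n";
  return 0;
}
```

Replacing `std::pow` with a small integer multiplication loop, as the diffs below do for both the cast and ceil/floor paths, keeps the factor exact for every scale the decimal types can represent.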
See also: - https://github.com/rapidsai/cudf/pull/14233#discussion_r1340786769 - https://github.com/rapidsai/cudf/issues/14243 Authors: - Bradley Dice (https://github.com/bdice) Approvers: - Mike Wilson (https://github.com/hyperbolic2346) - Vukasin Milovanovic (https://github.com/vuule) --- cpp/src/unary/cast_ops.cu | 8 +++++- cpp/src/unary/math_ops.cu | 8 ++++-- cpp/tests/unary/cast_tests.cpp | 40 ++++++++++++++++++++++++++++++ cpp/tests/unary/unary_ops_test.cpp | 33 ++++++++++++++++++++++++ 4 files changed, 86 insertions(+), 3 deletions(-) diff --git a/cpp/src/unary/cast_ops.cu b/cpp/src/unary/cast_ops.cu index f40ace8d10b..1c81f266200 100644 --- a/cpp/src/unary/cast_ops.cu +++ b/cpp/src/unary/cast_ops.cu @@ -199,7 +199,13 @@ std::unique_ptr rescale(column_view input, } return output_column; } - auto const scalar = make_fixed_point_scalar(std::pow(10, -diff), scale_type{diff}, stream); + + RepType scalar_value = 10; + for (int i = 1; i < -diff; ++i) { + scalar_value *= 10; + } + + auto const scalar = make_fixed_point_scalar(scalar_value, scale_type{diff}, stream); return detail::binary_operation(input, *scalar, binary_operator::DIV, type, stream, mr); } }; diff --git a/cpp/src/unary/math_ops.cu b/cpp/src/unary/math_ops.cu index 961f3a9e720..d0cae81a9c8 100644 --- a/cpp/src/unary/math_ops.cu +++ b/cpp/src/unary/math_ops.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2022, NVIDIA CORPORATION. + * Copyright (c) 2019-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -295,7 +295,11 @@ std::unique_ptr unary_op_with(column_view const& input, input.type(), input.size(), copy_bitmask(input, stream, mr), input.null_count(), stream, mr); auto out_view = result->mutable_view(); - Type const n = std::pow(10, -input.type().scale()); + + Type n = 10; + for (int i = 1; i < -input.type().scale(); ++i) { + n *= 10; + } thrust::transform(rmm::exec_policy(stream), input.begin(), diff --git a/cpp/tests/unary/cast_tests.cpp b/cpp/tests/unary/cast_tests.cpp index 9506e1918c0..d565359a4ea 100644 --- a/cpp/tests/unary/cast_tests.cpp +++ b/cpp/tests/unary/cast_tests.cpp @@ -30,6 +30,8 @@ #include #include +#include + #include #include @@ -967,6 +969,44 @@ TYPED_TEST(FixedPointTests, Decimal128ToDecimalXXWithLargerScale) CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view()); } +TYPED_TEST(FixedPointTests, ValidateCastRescalePrecision) +{ + using namespace numeric; + using decimalXX = TypeParam; + using RepType = cudf::device_storage_type_t; + using fp_wrapper = cudf::test::fixed_point_column_wrapper; + + // This test is designed to protect against floating point conversion + // introducing errors in fixed-point arithmetic. The rescaling that occurs + // during casting to different scales should only use fixed-precision math. + // Realistically, we are only able to show precision failures due to floating + // conversion in a few very specific circumstances where dividing by specific + // powers of 10 works against us. Some examples: 10^23, 10^25, 10^26, 10^27, + // 10^30, 10^32, 10^36. See https://godbolt.org/z/cP1MddP8P for a derivation. + // For completeness and to ensure that we are not missing any other cases, we + // test casting to/from all scales in the range of each decimal type. Values + // that are powers of ten show this error more readily than non-powers of 10 + // because the rescaling factor is a power of 10, meaning that errors in + // division are more visible. 
+ constexpr auto min_scale = -cuda::std::numeric_limits::digits10; + for (int input_scale = 0; input_scale >= min_scale; --input_scale) { + for (int result_scale = 0; result_scale >= min_scale; --result_scale) { + RepType input_value = 1; + for (int k = 0; k > input_scale; --k) { + input_value *= 10; + } + RepType result_value = 1; + for (int k = 0; k > result_scale; --k) { + result_value *= 10; + } + auto const input = fp_wrapper{{input_value}, scale_type{input_scale}}; + auto const expected = fp_wrapper{{result_value}, scale_type{result_scale}}; + auto const result = cudf::cast(input, make_fixed_point_data_type(result_scale)); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view()); + } + } +} + TYPED_TEST(FixedPointTests, Decimal32ToDecimalXXWithLargerScaleAndNullMask) { using namespace numeric; diff --git a/cpp/tests/unary/unary_ops_test.cpp b/cpp/tests/unary/unary_ops_test.cpp index 49764f22373..76d1f769856 100644 --- a/cpp/tests/unary/unary_ops_test.cpp +++ b/cpp/tests/unary/unary_ops_test.cpp @@ -24,6 +24,8 @@ #include +#include + template cudf::test::fixed_width_column_wrapper create_fixed_columns(cudf::size_type start, cudf::size_type size, @@ -372,4 +374,35 @@ TYPED_TEST(FixedPointUnaryTests, FixedPointUnaryFloorLarge) CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view()); } +TYPED_TEST(FixedPointUnaryTests, ValidateCeilFloorPrecision) +{ + using namespace numeric; + using decimalXX = TypeParam; + using RepType = cudf::device_storage_type_t; + using fp_wrapper = cudf::test::fixed_point_column_wrapper; + + // This test is designed to protect against floating point conversion + // introducing errors in fixed-point arithmetic. The rounding that occurs + // during ceil/floor should only use fixed-precision math. Realistically, + // we are only able to show precision failures due to floating conversion in + // a few very specific circumstances where dividing by specific powers of 10 + // works against us. Some examples: 10^23, 10^25, 10^26, 10^27, 10^30, + // 10^32, 10^36. See https://godbolt.org/z/cP1MddP8P for a derivation. For + // completeness and to ensure that we are not missing any other cases, we + // test all scales representable in the range of each decimal type. 
+ constexpr auto min_scale = -cuda::std::numeric_limits::digits10; + for (int input_scale = 0; input_scale >= min_scale; --input_scale) { + RepType input_value = 1; + for (int k = 0; k > input_scale; --k) { + input_value *= 10; + } + auto const input = fp_wrapper{{input_value}, scale_type{input_scale}}; + auto const expected = fp_wrapper{{input_value}, scale_type{input_scale}}; + auto const ceil_result = cudf::unary_operation(input, cudf::unary_operator::CEIL); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, ceil_result->view()); + auto const floor_result = cudf::unary_operation(input, cudf::unary_operator::FLOOR); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, floor_result->view()); + } +} + CUDF_TEST_PROGRAM_MAIN() From 135879368a8fcecda0a1d85bcf18b7e15cd0269d Mon Sep 17 00:00:00 2001 From: Ray Douglass Date: Wed, 11 Oct 2023 10:28:59 -0400 Subject: [PATCH 230/230] Update Changelog [skip ci] --- CHANGELOG.md | 262 +++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 262 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 76abf241d96..ecd547ab5b3 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,265 @@ +# cuDF 23.10.00 (11 Oct 2023) + +## 🚨 Breaking Changes + +- Expose stream parameter in public nvtext ngram APIs ([#14061](https://github.com/rapidsai/cudf/pull/14061)) [@davidwendt](https://github.com/davidwendt) +- Raise `MixedTypeError` when a column of mixed-dtype is being constructed ([#14050](https://github.com/rapidsai/cudf/pull/14050)) [@galipremsagar](https://github.com/galipremsagar) +- Raise `NotImplementedError` for `MultiIndex.to_series` ([#14049](https://github.com/rapidsai/cudf/pull/14049)) [@galipremsagar](https://github.com/galipremsagar) +- Create table_input_metadata from a table_metadata ([#13920](https://github.com/rapidsai/cudf/pull/13920)) [@etseidl](https://github.com/etseidl) +- Enable RLE boolean encoding for v2 Parquet files ([#13886](https://github.com/rapidsai/cudf/pull/13886)) [@etseidl](https://github.com/etseidl) +- Change `NA` to `NaT` for `datetime` and `timedelta` types ([#13868](https://github.com/rapidsai/cudf/pull/13868)) [@galipremsagar](https://github.com/galipremsagar) +- Fix `any`, `all` reduction behavior for `axis=None` and warn for other reductions ([#13831](https://github.com/rapidsai/cudf/pull/13831)) [@galipremsagar](https://github.com/galipremsagar) +- Add minhash support for MurmurHash3_x64_128 ([#13796](https://github.com/rapidsai/cudf/pull/13796)) [@davidwendt](https://github.com/davidwendt) +- Remove the libcudf cudf::offset_type type ([#13788](https://github.com/rapidsai/cudf/pull/13788)) [@davidwendt](https://github.com/davidwendt) +- Raise error when trying to join `datetime` and `timedelta` types with other types ([#13786](https://github.com/rapidsai/cudf/pull/13786)) [@galipremsagar](https://github.com/galipremsagar) +- Update to Cython 3.0.0 ([#13777](https://github.com/rapidsai/cudf/pull/13777)) [@vyasr](https://github.com/vyasr) +- Raise error on constructing an array from mixed type inputs ([#13768](https://github.com/rapidsai/cudf/pull/13768)) [@galipremsagar](https://github.com/galipremsagar) +- Enforce deprecations in `23.10` ([#13732](https://github.com/rapidsai/cudf/pull/13732)) [@galipremsagar](https://github.com/galipremsagar) +- Upgrade to arrow 12 ([#13728](https://github.com/rapidsai/cudf/pull/13728)) [@galipremsagar](https://github.com/galipremsagar) +- Remove Arrow dependency from the `datasource.hpp` public header ([#13698](https://github.com/rapidsai/cudf/pull/13698)) 
[@vuule](https://github.com/vuule) + +## 🐛 Bug Fixes + +- Fix inaccurate ceil/floor and inaccurate rescaling casts of fixed-point values. ([#14242](https://github.com/rapidsai/cudf/pull/14242)) [@bdice](https://github.com/bdice) +- Fix inaccuracy in decimal128 rounding. ([#14233](https://github.com/rapidsai/cudf/pull/14233)) [@bdice](https://github.com/bdice) +- Workaround for illegal instruction error in sm90 for warp instrinsics with mask ([#14201](https://github.com/rapidsai/cudf/pull/14201)) [@karthikeyann](https://github.com/karthikeyann) +- Fix pytorch related pytest ([#14198](https://github.com/rapidsai/cudf/pull/14198)) [@galipremsagar](https://github.com/galipremsagar) +- Pin to `aws-sdk-cpp<1.11` ([#14173](https://github.com/rapidsai/cudf/pull/14173)) [@pentschev](https://github.com/pentschev) +- Fix assert failure for range window functions ([#14168](https://github.com/rapidsai/cudf/pull/14168)) [@mythrocks](https://github.com/mythrocks) +- Fix Memcheck error found in JSON_TEST JsonReaderTest.ErrorStrings ([#14164](https://github.com/rapidsai/cudf/pull/14164)) [@karthikeyann](https://github.com/karthikeyann) +- Fix calls to copy_bitmask to pass stream parameter ([#14158](https://github.com/rapidsai/cudf/pull/14158)) [@davidwendt](https://github.com/davidwendt) +- Fix DataFrame from Series with different CategoricalIndexes ([#14157](https://github.com/rapidsai/cudf/pull/14157)) [@mroeschke](https://github.com/mroeschke) +- Pin to numpy<1.25 and numba<0.58 to avoid errors and deprecation warnings-as-errors. ([#14156](https://github.com/rapidsai/cudf/pull/14156)) [@bdice](https://github.com/bdice) +- Fix kernel launch error for cudf::io::orc::gpu::rowgroup_char_counts_kernel ([#14139](https://github.com/rapidsai/cudf/pull/14139)) [@davidwendt](https://github.com/davidwendt) +- Don't sort columns for DataFrame init from list of Series ([#14136](https://github.com/rapidsai/cudf/pull/14136)) [@mroeschke](https://github.com/mroeschke) +- Fix DataFrame.values with no columns but index ([#14134](https://github.com/rapidsai/cudf/pull/14134)) [@mroeschke](https://github.com/mroeschke) +- Avoid circular cimports in _lib/cpp/reduce.pxd ([#14125](https://github.com/rapidsai/cudf/pull/14125)) [@vyasr](https://github.com/vyasr) +- Add support for nested dict in `DataFrame` constructor ([#14119](https://github.com/rapidsai/cudf/pull/14119)) [@galipremsagar](https://github.com/galipremsagar) +- Restrict iterables of `DataFrame`'s as input to `DataFrame` constructor ([#14118](https://github.com/rapidsai/cudf/pull/14118)) [@galipremsagar](https://github.com/galipremsagar) +- Allow `numeric_only=True` for reduction operations on numeric types ([#14111](https://github.com/rapidsai/cudf/pull/14111)) [@galipremsagar](https://github.com/galipremsagar) +- Preserve name of the column while initializing a `DataFrame` ([#14110](https://github.com/rapidsai/cudf/pull/14110)) [@galipremsagar](https://github.com/galipremsagar) +- Correct numerous 20054-D: dynamic initialization errors found on arm+12.2 ([#14108](https://github.com/rapidsai/cudf/pull/14108)) [@robertmaynard](https://github.com/robertmaynard) +- Drop `kwargs` from `Series.count` ([#14106](https://github.com/rapidsai/cudf/pull/14106)) [@galipremsagar](https://github.com/galipremsagar) +- Fix naming issues with `Index.to_frame` and `MultiIndex.to_frame` APIs ([#14105](https://github.com/rapidsai/cudf/pull/14105)) [@galipremsagar](https://github.com/galipremsagar) +- Only use memory resources that haven't been freed 
([#14103](https://github.com/rapidsai/cudf/pull/14103)) [@robertmaynard](https://github.com/robertmaynard) +- Add support for `__round__` in `Series` and `DataFrame` ([#14099](https://github.com/rapidsai/cudf/pull/14099)) [@galipremsagar](https://github.com/galipremsagar) +- Validate ignore_index type in drop_duplicates ([#14098](https://github.com/rapidsai/cudf/pull/14098)) [@mroeschke](https://github.com/mroeschke) +- Fix renaming `Series` and `Index` ([#14080](https://github.com/rapidsai/cudf/pull/14080)) [@galipremsagar](https://github.com/galipremsagar) +- Raise NotImplementedError in to_datetime if Z (or tz component) in string ([#14074](https://github.com/rapidsai/cudf/pull/14074)) [@mroeschke](https://github.com/mroeschke) +- Raise NotImplementedError for datetime strings with UTC offset ([#14070](https://github.com/rapidsai/cudf/pull/14070)) [@mroeschke](https://github.com/mroeschke) +- Update pyarrow-related dispatch logic in dask_cudf ([#14069](https://github.com/rapidsai/cudf/pull/14069)) [@rjzamora](https://github.com/rjzamora) +- Use `conda mambabuild` rather than `mamba mambabuild` ([#14067](https://github.com/rapidsai/cudf/pull/14067)) [@wence-](https://github.com/wence-) +- Raise NotImplementedError in to_datetime with dayfirst without infer_format ([#14058](https://github.com/rapidsai/cudf/pull/14058)) [@mroeschke](https://github.com/mroeschke) +- Fix various issues in `Index.intersection` ([#14054](https://github.com/rapidsai/cudf/pull/14054)) [@galipremsagar](https://github.com/galipremsagar) +- Fix `Index.difference` to match with pandas ([#14053](https://github.com/rapidsai/cudf/pull/14053)) [@galipremsagar](https://github.com/galipremsagar) +- Fix empty string column construction ([#14052](https://github.com/rapidsai/cudf/pull/14052)) [@galipremsagar](https://github.com/galipremsagar) +- Fix `IntervalIndex.union` to preserve type-metadata ([#14051](https://github.com/rapidsai/cudf/pull/14051)) [@galipremsagar](https://github.com/galipremsagar) +- Raise `MixedTypeError` when a column of mixed-dtype is being constructed ([#14050](https://github.com/rapidsai/cudf/pull/14050)) [@galipremsagar](https://github.com/galipremsagar) +- Raise `NotImplementedError` for `MultiIndex.to_series` ([#14049](https://github.com/rapidsai/cudf/pull/14049)) [@galipremsagar](https://github.com/galipremsagar) +- Ignore compile_commands.json ([#14048](https://github.com/rapidsai/cudf/pull/14048)) [@harrism](https://github.com/harrism) +- Raise TypeError for any non-parseable argument in to_datetime ([#14044](https://github.com/rapidsai/cudf/pull/14044)) [@mroeschke](https://github.com/mroeschke) +- Raise NotImplementedError for to_datetime with z format ([#14037](https://github.com/rapidsai/cudf/pull/14037)) [@mroeschke](https://github.com/mroeschke) +- Implement `sort_remaining` for `sort_index` ([#14033](https://github.com/rapidsai/cudf/pull/14033)) [@wence-](https://github.com/wence-) +- Raise NotImplementedError for Categoricals with timezones ([#14032](https://github.com/rapidsai/cudf/pull/14032)) [@mroeschke](https://github.com/mroeschke) +- Temporary fix Parquet metadata with empty value string being ignored from writing ([#14026](https://github.com/rapidsai/cudf/pull/14026)) [@ttnghia](https://github.com/ttnghia) +- Preserve types of scalar being returned when possible in `quantile` ([#14014](https://github.com/rapidsai/cudf/pull/14014)) [@galipremsagar](https://github.com/galipremsagar) +- Fix return type of `MultiIndex.difference` 
+- Raise an error when timezone subtypes are encountered in `pd.IntervalDtype` ([#14006](https://github.com/rapidsai/cudf/pull/14006)) [@galipremsagar](https://github.com/galipremsagar)
+- Fix map column can not be non-nullable for java ([#14003](https://github.com/rapidsai/cudf/pull/14003)) [@res-life](https://github.com/res-life)
+- Fix `name` selection in `Index.difference` and `Index.intersection` ([#13986](https://github.com/rapidsai/cudf/pull/13986)) [@galipremsagar](https://github.com/galipremsagar)
+- Restore column type metadata with `dropna` to fix `factorize` API ([#13980](https://github.com/rapidsai/cudf/pull/13980)) [@galipremsagar](https://github.com/galipremsagar)
+- Use thread_index_type to avoid out of bounds accesses in conditional joins ([#13971](https://github.com/rapidsai/cudf/pull/13971)) [@vyasr](https://github.com/vyasr)
+- Fix `MultiIndex.to_numpy` to return numpy array with tuples ([#13966](https://github.com/rapidsai/cudf/pull/13966)) [@galipremsagar](https://github.com/galipremsagar)
+- Use cudf::thread_index_type in get_json_object and tdigest kernels ([#13962](https://github.com/rapidsai/cudf/pull/13962)) [@nvdbaranec](https://github.com/nvdbaranec)
+- Fix an issue with `IntervalIndex.repr` when null values are present ([#13958](https://github.com/rapidsai/cudf/pull/13958)) [@galipremsagar](https://github.com/galipremsagar)
+- Fix type metadata issue preservation with `Column.unique` ([#13957](https://github.com/rapidsai/cudf/pull/13957)) [@galipremsagar](https://github.com/galipremsagar)
+- Handle `Interval` scalars when passed in list-like inputs to `cudf.Index` ([#13956](https://github.com/rapidsai/cudf/pull/13956)) [@galipremsagar](https://github.com/galipremsagar)
+- Fix setting of categories order when `dtype` is passed to a `CategoricalColumn` ([#13955](https://github.com/rapidsai/cudf/pull/13955)) [@galipremsagar](https://github.com/galipremsagar)
+- Handle `as_index` in `GroupBy.apply` ([#13951](https://github.com/rapidsai/cudf/pull/13951)) [@brandon-b-miller](https://github.com/brandon-b-miller)
+- Raise error for string types in `nsmallest` and `nlargest` ([#13946](https://github.com/rapidsai/cudf/pull/13946)) [@galipremsagar](https://github.com/galipremsagar)
+- Fix `index` of `Groupby.apply` results when it is performed on empty objects ([#13944](https://github.com/rapidsai/cudf/pull/13944)) [@galipremsagar](https://github.com/galipremsagar)
+- Fix integer overflow in shim `device_sum` functions ([#13943](https://github.com/rapidsai/cudf/pull/13943)) [@brandon-b-miller](https://github.com/brandon-b-miller)
+- Fix type mismatch in groupby reduction for empty objects ([#13942](https://github.com/rapidsai/cudf/pull/13942)) [@galipremsagar](https://github.com/galipremsagar)
+- Fixed processed bytes calculation in APPLY_BOOLEAN_MASK benchmark. ([#13937](https://github.com/rapidsai/cudf/pull/13937)) [@Blonck](https://github.com/Blonck)
+- Fix construction of `Grouping` objects ([#13932](https://github.com/rapidsai/cudf/pull/13932)) [@galipremsagar](https://github.com/galipremsagar)
+- Fix an issue with `loc` when column names is `MultiIndex` ([#13929](https://github.com/rapidsai/cudf/pull/13929)) [@galipremsagar](https://github.com/galipremsagar)
+- Fix handling of typecasting in `searchsorted` ([#13925](https://github.com/rapidsai/cudf/pull/13925)) [@galipremsagar](https://github.com/galipremsagar)
+- Preserve index `name` in `reindex` ([#13917](https://github.com/rapidsai/cudf/pull/13917)) [@galipremsagar](https://github.com/galipremsagar)
+- Use `cudf::thread_index_type` in cuIO to prevent overflow in row indexing ([#13910](https://github.com/rapidsai/cudf/pull/13910)) [@vuule](https://github.com/vuule)
+- Fix for encodings listed in the Parquet column chunk metadata ([#13907](https://github.com/rapidsai/cudf/pull/13907)) [@etseidl](https://github.com/etseidl)
+- Use cudf::thread_index_type in concatenate.cu. ([#13906](https://github.com/rapidsai/cudf/pull/13906)) [@bdice](https://github.com/bdice)
+- Use cudf::thread_index_type in replace.cu. ([#13905](https://github.com/rapidsai/cudf/pull/13905)) [@bdice](https://github.com/bdice)
+- Add noSanitizer tag to Java reduction tests failing with sanitizer in CUDA 12 ([#13904](https://github.com/rapidsai/cudf/pull/13904)) [@jlowe](https://github.com/jlowe)
+- Remove the internal use of the cudf's default stream in cuIO ([#13903](https://github.com/rapidsai/cudf/pull/13903)) [@vuule](https://github.com/vuule)
+- Use cuda-nvtx-dev CUDA 12 package. ([#13901](https://github.com/rapidsai/cudf/pull/13901)) [@bdice](https://github.com/bdice)
+- Use `thread_index_type` to avoid index overflow in grid-stride loops ([#13895](https://github.com/rapidsai/cudf/pull/13895)) [@PointKernel](https://github.com/PointKernel)
+- Fix memory access error in cudf::shift for sliced strings ([#13894](https://github.com/rapidsai/cudf/pull/13894)) [@davidwendt](https://github.com/davidwendt)
+- Raise error when trying to construct a `DataFrame` with mixed types ([#13889](https://github.com/rapidsai/cudf/pull/13889)) [@galipremsagar](https://github.com/galipremsagar)
+- Return `nan` when one variable to be correlated has zero variance in JIT GroupBy Apply ([#13884](https://github.com/rapidsai/cudf/pull/13884)) [@brandon-b-miller](https://github.com/brandon-b-miller)
+- Correctly detect the BOM mark in `read_csv` with compressed input ([#13881](https://github.com/rapidsai/cudf/pull/13881)) [@vuule](https://github.com/vuule)
+- Check for the presence of all values in `MultiIndex.isin` ([#13879](https://github.com/rapidsai/cudf/pull/13879)) [@galipremsagar](https://github.com/galipremsagar)
+- Fix nvtext::generate_character_ngrams performance regression for longer strings ([#13874](https://github.com/rapidsai/cudf/pull/13874)) [@davidwendt](https://github.com/davidwendt)
+- Fix return type of `MultiIndex.levels` ([#13870](https://github.com/rapidsai/cudf/pull/13870)) [@galipremsagar](https://github.com/galipremsagar)
+- Fix List's missing children metadata in JSON writer ([#13869](https://github.com/rapidsai/cudf/pull/13869)) [@karthikeyann](https://github.com/karthikeyann)
+- Disable construction of Index when `freq` is set in pandas-compatibility mode ([#13857](https://github.com/rapidsai/cudf/pull/13857)) [@galipremsagar](https://github.com/galipremsagar)
+- Fix an issue with fetching `NA` from a `TimedeltaColumn` ([#13853](https://github.com/rapidsai/cudf/pull/13853)) [@galipremsagar](https://github.com/galipremsagar)
+- Simplify implementation of interval_range() and fix behaviour for floating `freq` ([#13844](https://github.com/rapidsai/cudf/pull/13844)) [@shwina](https://github.com/shwina)
+- Fix binary operations between `Series` and `Index` ([#13842](https://github.com/rapidsai/cudf/pull/13842)) [@galipremsagar](https://github.com/galipremsagar)
+- Update make_lists_column_from_scalar to use make_offsets_child_column utility ([#13841](https://github.com/rapidsai/cudf/pull/13841)) [@davidwendt](https://github.com/davidwendt)
+- Fix read out of bounds in string concatenate ([#13838](https://github.com/rapidsai/cudf/pull/13838)) [@pentschev](https://github.com/pentschev)
+- Raise error for more cases when `timezone-aware` data is passed to `as_column` ([#13835](https://github.com/rapidsai/cudf/pull/13835)) [@galipremsagar](https://github.com/galipremsagar)
+- Fix `any`, `all` reduction behavior for `axis=None` and warn for other reductions ([#13831](https://github.com/rapidsai/cudf/pull/13831)) [@galipremsagar](https://github.com/galipremsagar)
+- Raise error when trying to construct time-zone aware timestamps ([#13830](https://github.com/rapidsai/cudf/pull/13830)) [@galipremsagar](https://github.com/galipremsagar)
+- Fix cuFile I/O factories ([#13829](https://github.com/rapidsai/cudf/pull/13829)) [@vuule](https://github.com/vuule)
+- DataFrame with namedtuples uses ._field as column names ([#13824](https://github.com/rapidsai/cudf/pull/13824)) [@mroeschke](https://github.com/mroeschke)
+- Branch 23.10 merge 23.08 ([#13822](https://github.com/rapidsai/cudf/pull/13822)) [@vyasr](https://github.com/vyasr)
+- Return a Series from JIT GroupBy apply, rather than a DataFrame ([#13820](https://github.com/rapidsai/cudf/pull/13820)) [@brandon-b-miller](https://github.com/brandon-b-miller)
+- No need to dlsym EnsureS3Finalized we can call it directly ([#13819](https://github.com/rapidsai/cudf/pull/13819)) [@robertmaynard](https://github.com/robertmaynard)
+- Raise error when mixed types are being constructed ([#13816](https://github.com/rapidsai/cudf/pull/13816)) [@galipremsagar](https://github.com/galipremsagar)
+- Fix unbounded sequence issue in `DataFrame` constructor ([#13811](https://github.com/rapidsai/cudf/pull/13811)) [@galipremsagar](https://github.com/galipremsagar)
+- Fix Byte-Pair-Encoding usage of cuco static-map for storing merge-pairs ([#13807](https://github.com/rapidsai/cudf/pull/13807)) [@davidwendt](https://github.com/davidwendt)
+- Fix for Parquet writer when requested pages per row is smaller than fragment size ([#13806](https://github.com/rapidsai/cudf/pull/13806)) [@etseidl](https://github.com/etseidl)
+- Remove hangs from trying to construct un-bounded sequences ([#13799](https://github.com/rapidsai/cudf/pull/13799)) [@galipremsagar](https://github.com/galipremsagar)
+- Bug/update libcudf to handle arrow12 changes ([#13794](https://github.com/rapidsai/cudf/pull/13794)) [@robertmaynard](https://github.com/robertmaynard)
+- Update get_arrow to arrows 12 CMake target name of arrow::xsimd ([#13790](https://github.com/rapidsai/cudf/pull/13790)) [@robertmaynard](https://github.com/robertmaynard)
+- Raise error when trying to join `datetime` and `timedelta` types with other types ([#13786](https://github.com/rapidsai/cudf/pull/13786)) [@galipremsagar](https://github.com/galipremsagar)
+- Fix negative unary operation for boolean type ([#13780](https://github.com/rapidsai/cudf/pull/13780)) [@galipremsagar](https://github.com/galipremsagar)
+- Fix contains(`in`) method for `Series` ([#13779](https://github.com/rapidsai/cudf/pull/13779)) [@galipremsagar](https://github.com/galipremsagar)
+- Fix binary operation column ordering and missing column issues ([#13778](https://github.com/rapidsai/cudf/pull/13778)) [@galipremsagar](https://github.com/galipremsagar)
+- Cast only time of day to nanos to avoid an overflow in Parquet INT96 write ([#13776](https://github.com/rapidsai/cudf/pull/13776)) [@gerashegalov](https://github.com/gerashegalov)
+- Preserve names of column object in various APIs ([#13772](https://github.com/rapidsai/cudf/pull/13772)) [@galipremsagar](https://github.com/galipremsagar)
+- Raise error on constructing an array from mixed type inputs ([#13768](https://github.com/rapidsai/cudf/pull/13768)) [@galipremsagar](https://github.com/galipremsagar)
+- Fix construction of DataFrames from dict when columns are provided ([#13766](https://github.com/rapidsai/cudf/pull/13766)) [@wence-](https://github.com/wence-)
+- Provide our own Cython declaration for make_unique ([#13746](https://github.com/rapidsai/cudf/pull/13746)) [@wence-](https://github.com/wence-)
+
+## 📖 Documentation
+
+- Fix typo in docstring: metadata. ([#14025](https://github.com/rapidsai/cudf/pull/14025)) [@bdice](https://github.com/bdice)
+- Fix typo in parquet/page_decode.cuh ([#13849](https://github.com/rapidsai/cudf/pull/13849)) [@XinyuZeng](https://github.com/XinyuZeng)
+- Simplify Python doc configuration ([#13826](https://github.com/rapidsai/cudf/pull/13826)) [@vyasr](https://github.com/vyasr)
+- Update documentation to reflect recent changes in JSON reader and writer ([#13791](https://github.com/rapidsai/cudf/pull/13791)) [@vuule](https://github.com/vuule)
+- Fix all warnings in Python docs ([#13789](https://github.com/rapidsai/cudf/pull/13789)) [@vyasr](https://github.com/vyasr)
+
+## 🚀 New Features
+
+- [Java] Add JNI bindings for `integers_to_hex` ([#14205](https://github.com/rapidsai/cudf/pull/14205)) [@razajafri](https://github.com/razajafri)
+- Propagate errors from Parquet reader kernels back to host ([#14167](https://github.com/rapidsai/cudf/pull/14167)) [@vuule](https://github.com/vuule)
+- JNI for `HISTOGRAM` and `MERGE_HISTOGRAM` aggregations ([#14154](https://github.com/rapidsai/cudf/pull/14154)) [@ttnghia](https://github.com/ttnghia)
+- Expose streams in all public sorting APIs ([#14146](https://github.com/rapidsai/cudf/pull/14146)) [@vyasr](https://github.com/vyasr)
+- Enable direct ingestion and production of Arrow scalars ([#14121](https://github.com/rapidsai/cudf/pull/14121)) [@vyasr](https://github.com/vyasr)
+- Implement `GroupBy.value_counts` to match pandas API ([#14114](https://github.com/rapidsai/cudf/pull/14114)) [@stmio](https://github.com/stmio)
+- Refactor parquet thrift reader ([#14097](https://github.com/rapidsai/cudf/pull/14097)) [@etseidl](https://github.com/etseidl)
+- Refactor `hash_reduce_by_row` ([#14095](https://github.com/rapidsai/cudf/pull/14095)) [@ttnghia](https://github.com/ttnghia)
+- Support negative preceding/following for ROW window functions ([#14093](https://github.com/rapidsai/cudf/pull/14093)) [@mythrocks](https://github.com/mythrocks)
+- Support for progressive parquet chunked reading. ([#14079](https://github.com/rapidsai/cudf/pull/14079)) [@nvdbaranec](https://github.com/nvdbaranec)
+- Implement `HISTOGRAM` and `MERGE_HISTOGRAM` aggregations ([#14045](https://github.com/rapidsai/cudf/pull/14045)) [@ttnghia](https://github.com/ttnghia)
+- Expose streams in public search APIs ([#14034](https://github.com/rapidsai/cudf/pull/14034)) [@vyasr](https://github.com/vyasr)
+- Expose streams in public replace APIs ([#14010](https://github.com/rapidsai/cudf/pull/14010)) [@vyasr](https://github.com/vyasr)
+- Add stream parameter to public cudf::strings::split APIs ([#13997](https://github.com/rapidsai/cudf/pull/13997)) [@davidwendt](https://github.com/davidwendt)
+- Expose streams in public filling APIs ([#13990](https://github.com/rapidsai/cudf/pull/13990)) [@vyasr](https://github.com/vyasr)
+- Expose streams in public concatenate APIs ([#13987](https://github.com/rapidsai/cudf/pull/13987)) [@vyasr](https://github.com/vyasr)
+- Use HostMemoryAllocator in jni::allocate_host_buffer ([#13975](https://github.com/rapidsai/cudf/pull/13975)) [@gerashegalov](https://github.com/gerashegalov)
+- Enable fractional null probability for hashing benchmark ([#13967](https://github.com/rapidsai/cudf/pull/13967)) [@Blonck](https://github.com/Blonck)
+- Switch pylibcudf-enabled types to use enum class in Cython ([#13931](https://github.com/rapidsai/cudf/pull/13931)) [@vyasr](https://github.com/vyasr)
+- Add nvtext::tokenize_with_vocabulary API ([#13930](https://github.com/rapidsai/cudf/pull/13930)) [@davidwendt](https://github.com/davidwendt)
+- Rewrite `DataFrame.stack` to support multi level column names ([#13927](https://github.com/rapidsai/cudf/pull/13927)) [@isVoid](https://github.com/isVoid)
+- Add HostMemoryAllocator interface ([#13924](https://github.com/rapidsai/cudf/pull/13924)) [@gerashegalov](https://github.com/gerashegalov)
+- Global stream pool ([#13922](https://github.com/rapidsai/cudf/pull/13922)) [@etseidl](https://github.com/etseidl)
+- Create table_input_metadata from a table_metadata ([#13920](https://github.com/rapidsai/cudf/pull/13920)) [@etseidl](https://github.com/etseidl)
+- Translate column size overflow exception to JNI ([#13911](https://github.com/rapidsai/cudf/pull/13911)) [@mythrocks](https://github.com/mythrocks)
+- Enable RLE boolean encoding for v2 Parquet files ([#13886](https://github.com/rapidsai/cudf/pull/13886)) [@etseidl](https://github.com/etseidl)
+- Exclude some tests from running with the compute sanitizer ([#13872](https://github.com/rapidsai/cudf/pull/13872)) [@firestarman](https://github.com/firestarman)
+- Expand statistics support in ORC writer ([#13848](https://github.com/rapidsai/cudf/pull/13848)) [@vuule](https://github.com/vuule)
+- Register the memory mapped buffer in `datasource` to improve H2D throughput ([#13814](https://github.com/rapidsai/cudf/pull/13814)) [@vuule](https://github.com/vuule)
+- Add cudf::strings::find function with target per row ([#13808](https://github.com/rapidsai/cudf/pull/13808)) [@davidwendt](https://github.com/davidwendt)
+- Add minhash support for MurmurHash3_x64_128 ([#13796](https://github.com/rapidsai/cudf/pull/13796)) [@davidwendt](https://github.com/davidwendt)
+- Remove unnecessary pointer copying in JIT GroupBy Apply ([#13792](https://github.com/rapidsai/cudf/pull/13792)) [@brandon-b-miller](https://github.com/brandon-b-miller)
+- Add 'poll' function to custreamz kafka consumer ([#13782](https://github.com/rapidsai/cudf/pull/13782)) [@jdye64](https://github.com/jdye64)
+- Support `corr` in `GroupBy.apply` through the jit engine ([#13767](https://github.com/rapidsai/cudf/pull/13767)) [@shwina](https://github.com/shwina)
+- Optionally write version 2 page headers in Parquet writer ([#13751](https://github.com/rapidsai/cudf/pull/13751)) [@etseidl](https://github.com/etseidl)
+- Support more numeric types in `Groupby.apply` with `engine='jit'` ([#13729](https://github.com/rapidsai/cudf/pull/13729)) [@brandon-b-miller](https://github.com/brandon-b-miller)
+- [FEA] Add DELTA_BINARY_PACKED decoding support to Parquet reader ([#13637](https://github.com/rapidsai/cudf/pull/13637)) [@etseidl](https://github.com/etseidl)
+- Read FIXED_LEN_BYTE_ARRAY as binary in parquet reader ([#13437](https://github.com/rapidsai/cudf/pull/13437)) [@PointKernel](https://github.com/PointKernel)
+
+## 🛠️ Improvements
+
+- Pin `dask` and `distributed` for `23.10` release ([#14225](https://github.com/rapidsai/cudf/pull/14225)) [@galipremsagar](https://github.com/galipremsagar)
+- update rmm tag path ([#14195](https://github.com/rapidsai/cudf/pull/14195)) [@AyodeAwe](https://github.com/AyodeAwe)
+- Disable `Recently Updated` Check ([#14193](https://github.com/rapidsai/cudf/pull/14193)) [@ajschmidt8](https://github.com/ajschmidt8)
+- Move cpp/src/hash/hash_allocator.cuh to include/cudf/hashing/detail ([#14163](https://github.com/rapidsai/cudf/pull/14163)) [@davidwendt](https://github.com/davidwendt)
+- Add Parquet reader benchmarks for row selection ([#14147](https://github.com/rapidsai/cudf/pull/14147)) [@vuule](https://github.com/vuule)
+- Update image names ([#14145](https://github.com/rapidsai/cudf/pull/14145)) [@AyodeAwe](https://github.com/AyodeAwe)
+- Support callables in DataFrame.assign ([#14142](https://github.com/rapidsai/cudf/pull/14142)) [@wence-](https://github.com/wence-)
+- Reduce memory usage of as_categorical_column ([#14138](https://github.com/rapidsai/cudf/pull/14138)) [@wence-](https://github.com/wence-)
+- Replace Python scalar conversions with libcudf ([#14124](https://github.com/rapidsai/cudf/pull/14124)) [@vyasr](https://github.com/vyasr)
+- Update to clang 16.0.6. ([#14120](https://github.com/rapidsai/cudf/pull/14120)) [@bdice](https://github.com/bdice)
+- Fix type of empty `Index` and raise warning in `Series` constructor ([#14116](https://github.com/rapidsai/cudf/pull/14116)) [@galipremsagar](https://github.com/galipremsagar)
+- Add stream parameter to external dict APIs ([#14115](https://github.com/rapidsai/cudf/pull/14115)) [@SurajAralihalli](https://github.com/SurajAralihalli)
+- Add fallback matrix for nvcomp. ([#14082](https://github.com/rapidsai/cudf/pull/14082)) [@bdice](https://github.com/bdice)
+- [Java] Add recoverWithNull to JSONOptions and pass to Table.readJSON ([#14078](https://github.com/rapidsai/cudf/pull/14078)) [@andygrove](https://github.com/andygrove)
+- Remove header tests ([#14072](https://github.com/rapidsai/cudf/pull/14072)) [@ajschmidt8](https://github.com/ajschmidt8)
+- Refactor `contains_table` with cuco::static_set ([#14064](https://github.com/rapidsai/cudf/pull/14064)) [@PointKernel](https://github.com/PointKernel)
+- Remove debug print in a Parquet test ([#14063](https://github.com/rapidsai/cudf/pull/14063)) [@vuule](https://github.com/vuule)
+- Expose stream parameter in public nvtext ngram APIs ([#14061](https://github.com/rapidsai/cudf/pull/14061)) [@davidwendt](https://github.com/davidwendt)
+- Expose stream parameter in public strings find APIs ([#14060](https://github.com/rapidsai/cudf/pull/14060)) [@davidwendt](https://github.com/davidwendt)
+- Update doxygen to 1.9.1 ([#14059](https://github.com/rapidsai/cudf/pull/14059)) [@vyasr](https://github.com/vyasr)
+- Remove the mr from the base fixture ([#14057](https://github.com/rapidsai/cudf/pull/14057)) [@vyasr](https://github.com/vyasr)
+- Expose streams in public strings case APIs ([#14056](https://github.com/rapidsai/cudf/pull/14056)) [@davidwendt](https://github.com/davidwendt)
+- Refactor libcudf indexalator to typed normalator ([#14043](https://github.com/rapidsai/cudf/pull/14043)) [@davidwendt](https://github.com/davidwendt)
+- Use cudf::make_empty_column instead of column_view constructor ([#14030](https://github.com/rapidsai/cudf/pull/14030)) [@davidwendt](https://github.com/davidwendt)
+- Remove quadratic runtime due to accessing Frame._dtypes in loop ([#14028](https://github.com/rapidsai/cudf/pull/14028)) [@wence-](https://github.com/wence-)
+- Explicitly depend on zlib in conda recipes ([#14018](https://github.com/rapidsai/cudf/pull/14018)) [@wence-](https://github.com/wence-)
+- Use grid_stride for stride computations. ([#13996](https://github.com/rapidsai/cudf/pull/13996)) [@bdice](https://github.com/bdice)
+- Fix an issue where casting null-array to `object` dtype will result in a failure ([#13994](https://github.com/rapidsai/cudf/pull/13994)) [@galipremsagar](https://github.com/galipremsagar)
+- Add tab as literal to cudf::test::to_string output ([#13993](https://github.com/rapidsai/cudf/pull/13993)) [@davidwendt](https://github.com/davidwendt)
+- Enable `codes` dtype parity in pandas-compatibility mode for `factorize` API ([#13982](https://github.com/rapidsai/cudf/pull/13982)) [@galipremsagar](https://github.com/galipremsagar)
+- Fix `CategoricalIndex` ordering in `Groupby.agg` when pandas-compatibility mode is enabled ([#13978](https://github.com/rapidsai/cudf/pull/13978)) [@galipremsagar](https://github.com/galipremsagar)
+- Produce a fatal error if cudf is unable to find pyarrow include directory ([#13976](https://github.com/rapidsai/cudf/pull/13976)) [@cwharris](https://github.com/cwharris)
+- Use `thread_index_type` in `partitioning.cu` ([#13973](https://github.com/rapidsai/cudf/pull/13973)) [@divyegala](https://github.com/divyegala)
+- Use `cudf::thread_index_type` in `merge.cu` ([#13972](https://github.com/rapidsai/cudf/pull/13972)) [@divyegala](https://github.com/divyegala)
+- Use `copy-pr-bot` ([#13970](https://github.com/rapidsai/cudf/pull/13970)) [@ajschmidt8](https://github.com/ajschmidt8)
+- Use cudf::thread_index_type in strings custom kernels ([#13968](https://github.com/rapidsai/cudf/pull/13968)) [@davidwendt](https://github.com/davidwendt)
+- Add `bytes_per_second` to hash_partition benchmark ([#13965](https://github.com/rapidsai/cudf/pull/13965)) [@Blonck](https://github.com/Blonck)
+- Added pinned pool reservation API for java ([#13964](https://github.com/rapidsai/cudf/pull/13964)) [@revans2](https://github.com/revans2)
+- Simplify wheel build scripts and allow alphas of RAPIDS dependencies ([#13963](https://github.com/rapidsai/cudf/pull/13963)) [@vyasr](https://github.com/vyasr)
+- Add `bytes_per_second` to copy_if_else benchmark ([#13960](https://github.com/rapidsai/cudf/pull/13960)) [@Blonck](https://github.com/Blonck)
+- Add pandas compatible output to `Series.unique` ([#13959](https://github.com/rapidsai/cudf/pull/13959)) [@galipremsagar](https://github.com/galipremsagar)
+- Add `bytes_per_second` to compiled binaryop benchmark ([#13938](https://github.com/rapidsai/cudf/pull/13938)) [@Blonck](https://github.com/Blonck)
+- Unpin `dask` and `distributed` for `23.10` development ([#13935](https://github.com/rapidsai/cudf/pull/13935)) [@galipremsagar](https://github.com/galipremsagar)
+- Make HostColumnVector.getRefCount public ([#13934](https://github.com/rapidsai/cudf/pull/13934)) [@abellina](https://github.com/abellina)
+- Use cuco::static_set in JSON tree algorithm ([#13928](https://github.com/rapidsai/cudf/pull/13928)) [@karthikeyann](https://github.com/karthikeyann)
+- Add java API to get size of host memory needed to copy column view ([#13919](https://github.com/rapidsai/cudf/pull/13919)) [@revans2](https://github.com/revans2)
+- Use cudf::size_type instead of int32 where appropriate in nvtext functions ([#13915](https://github.com/rapidsai/cudf/pull/13915)) [@davidwendt](https://github.com/davidwendt)
+- Enable hugepage for arrow host allocations ([#13914](https://github.com/rapidsai/cudf/pull/13914)) [@madsbk](https://github.com/madsbk)
+- Improve performance of nvtext::edit_distance ([#13912](https://github.com/rapidsai/cudf/pull/13912)) [@davidwendt](https://github.com/davidwendt)
+- Ensure cudf internals use pylibcudf in pure Python mode ([#13909](https://github.com/rapidsai/cudf/pull/13909)) [@vyasr](https://github.com/vyasr)
+- Use `empty()` instead of `size()` where possible ([#13908](https://github.com/rapidsai/cudf/pull/13908)) [@vuule](https://github.com/vuule)
+- [JNI] Adds HostColumnVector.EventHandler for spillability checks ([#13898](https://github.com/rapidsai/cudf/pull/13898)) [@abellina](https://github.com/abellina)
+- Return `Timestamp` & `Timedelta` for fetching scalars in `DatetimeIndex` & `TimedeltaIndex` ([#13896](https://github.com/rapidsai/cudf/pull/13896)) [@galipremsagar](https://github.com/galipremsagar)
+- Allow explicit `shuffle="p2p"` within dask-cudf API ([#13893](https://github.com/rapidsai/cudf/pull/13893)) [@rjzamora](https://github.com/rjzamora)
+- Disable creation of `DatetimeIndex` when `freq` is passed to `cudf.date_range` ([#13890](https://github.com/rapidsai/cudf/pull/13890)) [@galipremsagar](https://github.com/galipremsagar)
+- Bring parity with pandas for `datetime` & `timedelta` comparison operations ([#13877](https://github.com/rapidsai/cudf/pull/13877)) [@galipremsagar](https://github.com/galipremsagar)
+- Change `NA` to `NaT` for `datetime` and `timedelta` types ([#13868](https://github.com/rapidsai/cudf/pull/13868)) [@galipremsagar](https://github.com/galipremsagar)
+- Raise error when `astype(object)` is called in pandas compatibility mode ([#13862](https://github.com/rapidsai/cudf/pull/13862)) [@galipremsagar](https://github.com/galipremsagar)
+- Fixes a performance regression in FST ([#13850](https://github.com/rapidsai/cudf/pull/13850)) [@elstehle](https://github.com/elstehle)
+- Set native handles to null on close in Java wrapper classes ([#13818](https://github.com/rapidsai/cudf/pull/13818)) [@jlowe](https://github.com/jlowe)
+- Avoid use of CUDF_EXPECTS in libcudf unit tests outside of helper functions with return values ([#13812](https://github.com/rapidsai/cudf/pull/13812)) [@vuule](https://github.com/vuule)
+- Update `lists::contains` to experimental row comparator ([#13810](https://github.com/rapidsai/cudf/pull/13810)) [@divyegala](https://github.com/divyegala)
+- Reduce `lists::contains` dispatches for scalars ([#13805](https://github.com/rapidsai/cudf/pull/13805)) [@divyegala](https://github.com/divyegala)
+- Long string optimization for string column parsing in JSON reader ([#13803](https://github.com/rapidsai/cudf/pull/13803)) [@karthikeyann](https://github.com/karthikeyann)
+- Raise NotImplementedError for pd.SparseDtype ([#13798](https://github.com/rapidsai/cudf/pull/13798)) [@mroeschke](https://github.com/mroeschke)
+- Remove the libcudf cudf::offset_type type ([#13788](https://github.com/rapidsai/cudf/pull/13788)) [@davidwendt](https://github.com/davidwendt)
+- Move Spark-independent Table debug to cudf Java ([#13783](https://github.com/rapidsai/cudf/pull/13783)) [@gerashegalov](https://github.com/gerashegalov)
+- Update to Cython 3.0.0 ([#13777](https://github.com/rapidsai/cudf/pull/13777)) [@vyasr](https://github.com/vyasr)
+- Refactor Parquet reader handling of V2 page header info ([#13775](https://github.com/rapidsai/cudf/pull/13775)) [@etseidl](https://github.com/etseidl)
+- Branch 23.10 merge 23.08 ([#13773](https://github.com/rapidsai/cudf/pull/13773)) [@vyasr](https://github.com/vyasr)
+- Restructure JSON code to correctly reflect legacy/experimental status ([#13757](https://github.com/rapidsai/cudf/pull/13757)) [@vuule](https://github.com/vuule)
+- Branch 23.10 merge 23.08 ([#13753](https://github.com/rapidsai/cudf/pull/13753)) [@vyasr](https://github.com/vyasr)
+- Enforce deprecations in `23.10` ([#13732](https://github.com/rapidsai/cudf/pull/13732)) [@galipremsagar](https://github.com/galipremsagar)
+- Upgrade to arrow 12 ([#13728](https://github.com/rapidsai/cudf/pull/13728)) [@galipremsagar](https://github.com/galipremsagar)
+- Refactors JSON reader's pushdown automaton ([#13716](https://github.com/rapidsai/cudf/pull/13716)) [@elstehle](https://github.com/elstehle)
+- Remove Arrow dependency from the `datasource.hpp` public header ([#13698](https://github.com/rapidsai/cudf/pull/13698)) [@vuule](https://github.com/vuule)
+
 # cuDF 23.08.00 (9 Aug 2023)
 
 ## 🚨 Breaking Changes